diff --git a/_build/backend.py b/_build/backend.py index 9e40d0817ef8b7b950487f3cbfce0d2183f14255..c0077bc0a2ce49195c57d409f3bf58a3876d8fe5 100644 --- a/_build/backend.py +++ b/_build/backend.py @@ -328,7 +328,17 @@ def build_wheel(wheel_directory, config_settings=None, metadata_directory=None): 'manylinux_2_27_x86_64', 'manylinux_2_28_x86_64', 'manylinux_2_31_x86_64', + 'manylinux_2_34_x86_64', + 'manylinux_2_35_x86_64', 'linux_x86_64', + 'manylinux2014_aarch64', # NB: equivalent to manylinux_2_17_aarch64 + 'manylinux_2_24_aarch64', + 'manylinux_2_27_aarch64', + 'manylinux_2_28_aarch64', + 'manylinux_2_31_aarch64', + 'manylinux_2_34_aarch64', + 'manylinux_2_35_aarch64', + 'linux_aarch64', ): logging.info('----------------------------------------') logging.info('Trying to delocate to platform: %s', plat) diff --git a/ccsrc/include/CMakeLists.txt b/ccsrc/include/CMakeLists.txt index 0e449bc8c91aabf2f0f498eade03ca1fc50f6a5b..0f805ed9e2d696dca5e01cd37fe60a406b41e73a 100644 --- a/ccsrc/include/CMakeLists.txt +++ b/ccsrc/include/CMakeLists.txt @@ -47,8 +47,14 @@ append_to_property(mq_install_targets GLOBAL include_lib) # ------------------------------------------------------------------------------ install( - DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/config ${CMAKE_CURRENT_LIST_DIR}/core ${CMAKE_CURRENT_LIST_DIR}/ops - ${CMAKE_CURRENT_LIST_DIR}/simulator ${CMAKE_CURRENT_LIST_DIR}/device ${CMAKE_CURRENT_LIST_DIR}/math + DIRECTORY + ${CMAKE_CURRENT_LIST_DIR}/algorithm + ${CMAKE_CURRENT_LIST_DIR}/config + ${CMAKE_CURRENT_LIST_DIR}/core + ${CMAKE_CURRENT_LIST_DIR}/ops + ${CMAKE_CURRENT_LIST_DIR}/simulator + ${CMAKE_CURRENT_LIST_DIR}/device + ${CMAKE_CURRENT_LIST_DIR}/math DESTINATION ${MQ_INSTALL_INCLUDEDIR}/ PATTERN "CPPLINT.cfg" EXCLUDE) diff --git a/ccsrc/include/algorithm/qaia/csr_base.h b/ccsrc/include/algorithm/qaia/csr_base.h new file mode 100644 index 0000000000000000000000000000000000000000..c694701e2b38fa8680f3d61f7e97cfe1ed18f5f4 --- /dev/null +++ b/ccsrc/include/algorithm/qaia/csr_base.h @@ -0,0 +1,40 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CSR_BASE_H +#define CSR_BASE_H + +#include "core/mq_base_types.h" + +namespace mindquantum::sparse { +template +struct CsrBase { + int dim_; + int nnz_; + Index *indptr_; + Index *indices_; + T *data_; + + CsrBase() : dim_(0), nnz_(0), indptr_(nullptr), indices_(nullptr), data_(nullptr) { + } + + CsrBase(int dim, int nnz, Index *indptr, Index *indices, T *data) + : dim_(dim), nnz_(nnz), indptr_(indptr), indices_(indices), data_(data) { + } +}; +} // namespace mindquantum::sparse + +#endif diff --git a/ccsrc/include/algorithm/qaia/detail/common.cuh b/ccsrc/include/algorithm/qaia/detail/common.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6df3ceb1bc9254d22e198f72a6c66f80fda252f1 --- /dev/null +++ b/ccsrc/include/algorithm/qaia/detail/common.cuh @@ -0,0 +1,77 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +static void HandleError(cudaError_t err, const char* file, int line) { + if (err != cudaSuccess) { + printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line); + exit(EXIT_FAILURE); + } + return; +} +#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) + +#define CHECK_CUDA(func) \ + { \ + cudaError_t status = (func); \ + if (status != cudaSuccess) { \ + printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, cudaGetErrorString(status), status); \ + } \ + } + +#define CHECK_CUSPARSE(func) \ + { \ + cusparseStatus_t status = (func); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, cusparseGetErrorString(status), \ + status); \ + } \ + } + +#define CHECK_CUBLAS(call) \ + { \ + cublasStatus_t err; \ + if ((err = (call)) != CUBLAS_STATUS_SUCCESS) { \ + std::cerr << "CUBLAS error in file '" << __FILE__ << "' in line " << __LINE__ << ": " \ + << getCublasErrorString(err) << "." << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } + +const char* getCublasErrorString(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + default: + return "UNKNOWN CUBLAS ERROR"; + } +} diff --git a/ccsrc/include/algorithm/qaia/detail/gpu_sb.cuh b/ccsrc/include/algorithm/qaia/detail/gpu_sb.cuh new file mode 100644 index 0000000000000000000000000000000000000000..62a5cff14dfc456227f811b2111de7296cb19c67 --- /dev/null +++ b/ccsrc/include/algorithm/qaia/detail/gpu_sb.cuh @@ -0,0 +1,47 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef INCLUDE_QAIA_DETAIL_GPU_SB_CUH
+#define INCLUDE_QAIA_DETAIL_GPU_SB_CUH
+
+#include "algorithm/qaia/csr_base.h"
+#include "algorithm/qaia/detail/para.h"
+
+namespace mindquantum::algorithm::qaia::detail {
+
+struct SBBase {
+    static void dSB_update_int8(mindquantum::sparse::CsrBase<double> csr, double* x, Para paras);
+
+    static void bSB_update_int8(mindquantum::sparse::CsrBase<double> csr, double* x, Para paras);
+
+    static void bSB_update_fp16(mindquantum::sparse::CsrBase<double> csr, double* x, Para paras);
+
+    static void dSB_update_fp16(mindquantum::sparse::CsrBase<double> csr, double* x, Para paras);
+
+    static void dSB_update_h_int8(mindquantum::sparse::CsrBase<double> csr, double* x, Para paras, double* h, int ndim);
+
+    static void bSB_update_h_int8(mindquantum::sparse::CsrBase<double> csr, double* x, Para paras, double* h, int ndim);
+
+    static void bSB_update_h_fp16(mindquantum::sparse::CsrBase<double> csr, double* x, Para paras, double* h, int ndim);
+
+    static void dSB_update_h_fp16(mindquantum::sparse::CsrBase<double> csr, double* x, Para paras, double* h, int ndim);
+
+    static void cublas_warmup(int N, int B);
+};
+
+}  // namespace mindquantum::algorithm::qaia::detail
+
+#endif
diff --git a/ccsrc/include/algorithm/qaia/detail/para.h b/ccsrc/include/algorithm/qaia/detail/para.h
new file mode 100644
index 0000000000000000000000000000000000000000..956473829dbebeb0a126081eb6a452af24c856a0
--- /dev/null
+++ b/ccsrc/include/algorithm/qaia/detail/para.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PARA_H
+#define PARA_H
+
+namespace mindquantum::algorithm::qaia::detail {
+
+struct Para {
+    int B;
+    float xi;
+    float delta;
+    float dt;
+    int n_iter;
+
+    Para() : B(0), xi(0.0f), delta(0.0f), dt(0.0f), n_iter(0) {
+    }
+
+    Para(int B_, float xi_, float delta_, float dt_, int n_iter_)
+        : B(B_), xi(xi_), delta(delta_), dt(dt_), n_iter(n_iter_) {
+    }
+};
+
+}  // namespace mindquantum::algorithm::qaia::detail
+
+#endif
diff --git a/ccsrc/include/algorithm/qaia/detail/sb_helper.h b/ccsrc/include/algorithm/qaia/detail/sb_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..decc60325fb3e81ae7e39d0234199f0a08eaee3f
--- /dev/null
+++ b/ccsrc/include/algorithm/qaia/detail/sb_helper.h
@@ -0,0 +1,97 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
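For orientation, here is a minimal host-side sketch of how the pieces declared above fit together. This is a hedged illustration, not code from the patch: the function name `run_bsb_once` and all parameter values are invented for the example, and `CsrBase<double>` follows the element type the update functions read back as `double*`.

```cpp
// Hedged sketch: driving one ballistic-SB run through the API declared above.
// Assumes a CUDA-enabled build; J is a symmetric Ising coupling matrix in CSR form.
#include <vector>

#include "algorithm/qaia/csr_base.h"
#include "algorithm/qaia/detail/gpu_sb.cuh"
#include "algorithm/qaia/detail/para.h"

using mindquantum::Index;
using namespace mindquantum::algorithm::qaia::detail;

void run_bsb_once(Index* indptr, Index* indices, double* values, int dim, int nnz) {
    mindquantum::sparse::CsrBase<double> csr(dim, nnz, indptr, indices, values);
    // B = batch size (parallel trajectories); xi, delta, dt are SB hyperparameters.
    Para paras(/*B=*/32, /*xi=*/0.1f, /*delta=*/1.0f, /*dt=*/0.3f, /*n_iter=*/1000);
    // Receives the final spin amplitudes, laid out as dim x B row-major.
    std::vector<double> x(static_cast<size_t>(dim) * paras.B, 0.0);
    SBBase::cublas_warmup(dim, paras.B);        // prime cuBLAS before timing runs
    SBBase::bSB_update_int8(csr, x.data(), paras);
}
```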
+ */
+
+#ifndef SB_HELPER_H
+#define SB_HELPER_H
+
+#include <cuda_fp16.h>
+
+#include "algorithm/qaia/csr_base.h"
+#include "algorithm/qaia/detail/gpu_sb.cuh"
+#include "algorithm/qaia/detail/para.h"
+
+namespace mindquantum::algorithm::qaia::detail {
+
+// Helper structure, using template specialization to select different update functions
+template <int SB, typename T, bool H>
+struct SBUpdater;
+
+template <>
+struct SBUpdater<0, half, true> {
+    static void update(const mindquantum::sparse::CsrBase<double>& csr_matrix, double* raw_x,
+                       const mindquantum::algorithm::qaia::detail::Para& paras, double* raw_h, int ndim) {
+        mindquantum::algorithm::qaia::detail::SBBase::bSB_update_h_fp16(csr_matrix, raw_x, paras, raw_h, ndim);
+    }
+};
+
+template <>
+struct SBUpdater<0, int8_t, true> {
+    static void update(const mindquantum::sparse::CsrBase<double>& csr_matrix, double* raw_x,
+                       const mindquantum::algorithm::qaia::detail::Para& paras, double* raw_h, int ndim) {
+        mindquantum::algorithm::qaia::detail::SBBase::bSB_update_h_int8(csr_matrix, raw_x, paras, raw_h, ndim);
+    }
+};
+
+template <>
+struct SBUpdater<1, half, true> {
+    static void update(const mindquantum::sparse::CsrBase<double>& csr_matrix, double* raw_x,
+                       const mindquantum::algorithm::qaia::detail::Para& paras, double* raw_h, int ndim) {
+        mindquantum::algorithm::qaia::detail::SBBase::dSB_update_h_fp16(csr_matrix, raw_x, paras, raw_h, ndim);
+    }
+};
+
+template <>
+struct SBUpdater<1, int8_t, true> {
+    static void update(const mindquantum::sparse::CsrBase<double>& csr_matrix, double* raw_x,
+                       const mindquantum::algorithm::qaia::detail::Para& paras, double* raw_h, int ndim) {
+        mindquantum::algorithm::qaia::detail::SBBase::dSB_update_h_int8(csr_matrix, raw_x, paras, raw_h, ndim);
+    }
+};
+
+template <>
+struct SBUpdater<0, half, false> {
+    static void update(const mindquantum::sparse::CsrBase<double>& csr_matrix, double* raw_x,
+                       const mindquantum::algorithm::qaia::detail::Para& paras, double* raw_h, int ndim) {
+        mindquantum::algorithm::qaia::detail::SBBase::bSB_update_fp16(csr_matrix, raw_x, paras);
+    }
+};
+
+template <>
+struct SBUpdater<0, int8_t, false> {
+    static void update(const mindquantum::sparse::CsrBase<double>& csr_matrix, double* raw_x,
+                       const mindquantum::algorithm::qaia::detail::Para& paras, double* raw_h, int ndim) {
+        mindquantum::algorithm::qaia::detail::SBBase::bSB_update_int8(csr_matrix, raw_x, paras);
+    }
+};
+
+template <>
+struct SBUpdater<1, half, false> {
+    static void update(const mindquantum::sparse::CsrBase<double>& csr_matrix, double* raw_x,
+                       const mindquantum::algorithm::qaia::detail::Para& paras, double* raw_h, int ndim) {
+        mindquantum::algorithm::qaia::detail::SBBase::dSB_update_fp16(csr_matrix, raw_x, paras);
+    }
+};
+
+template <>
+struct SBUpdater<1, int8_t, false> {
+    static void update(const mindquantum::sparse::CsrBase<double>& csr_matrix, double* raw_x,
+                       const mindquantum::algorithm::qaia::detail::Para& paras, double* raw_h, int ndim) {
+        mindquantum::algorithm::qaia::detail::SBBase::dSB_update_int8(csr_matrix, raw_x, paras);
+    }
+};
+
+}  // namespace mindquantum::algorithm::qaia::detail
+#endif
diff --git a/ccsrc/include/algorithm/qaia/detail/tools.cuh b/ccsrc/include/algorithm/qaia/detail/tools.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..35e0be49e45ec5b929375b048e98cb10c679bd76
--- /dev/null
+++ b/ccsrc/include/algorithm/qaia/detail/tools.cuh
@@ -0,0 +1,216 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
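The specializations above reduce the run-time branching in the Python bindings to a single compile-time lookup. A hedged sketch of the dispatch pattern (the wrapper name `run` and the template parameter names are illustrative, not taken from the patch):

```cpp
// Hedged sketch: SB selects ballistic (0) or discrete (1) simulated bifurcation,
// T the GEMM precision (int8_t or half), H whether an external field h is used.
// All three are fixed at compile time, so the call below has no branch at all.
template <int SB, typename T, bool H>
void run(const mindquantum::sparse::CsrBase<double>& csr, double* x,
         const mindquantum::algorithm::qaia::detail::Para& paras, double* h, int ndim) {
    mindquantum::algorithm::qaia::detail::SBUpdater<SB, T, H>::update(csr, x, paras, h, ndim);
}
// e.g. run<1, int8_t, true>(csr, x, paras, h, ndim) lands in dSB_update_h_int8.
```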
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TOOLS_CUH +#define TOOLS_CUH + +#include + +#include + +#include +#include + +#include "algorithm/qaia/csr_base.h" +#include "core/mq_base_types.h" + +using mindquantum::Index; + +template +struct Bounds; + +// Specialization for int8_t +template <> +struct Bounds { + static __device__ int8_t UP() { + return 127; + } + static __device__ int8_t LOW() { + return -127; + } +}; + +// Specialization for half +template <> +struct Bounds { + static __device__ half UP() { + return __float2half(1.0f); + } + static __device__ half LOW() { + return __float2half(-1.0f); + } +}; + +// Templated CUDA kernel functions +template +__global__ void sign_kernel(const T* __restrict__ x, T* __restrict__ signx, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) + return; + + T zero = static_cast(0); + if (x[idx] > zero) { + signx[idx] = Bounds::UP(); + } else if (x[idx] < zero) { + signx[idx] = Bounds::LOW(); + } else { + signx[idx] = zero; + } +} + +#define UP 127 +#define LOW -127 + +__global__ void update_tail_half(const half* __restrict__ tmp, half* x, half* y, int size, float xi, float beta, + float delta, float dt) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) + return; + y[idx] += static_cast(xi * static_cast(tmp[idx]) + beta * static_cast(x[idx])); + half x_idx = static_cast(x[idx]) + y[idx] * static_cast(delta * dt); + if (x_idx >= static_cast(1.0)) { + x[idx] = 1.0; + y[idx] = 0; + } else if (x_idx <= static_cast(-1.0)) { + x[idx] = -1.0; + y[idx] = 0; + } else { + x[idx] = x_idx; + } +} + +__global__ void update_h_tail_half(const half* __restrict__ tmp, half* x, half* y, half* h, int size, float xi, + float beta, float delta, float dt) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) + return; + y[idx] += static_cast(xi * static_cast(tmp[idx] + h[idx]) + beta * static_cast(x[idx])); + half x_idx = static_cast(x[idx]) + y[idx] * static_cast(delta * dt); + if (x_idx >= static_cast(1.0)) { + x[idx] = 1.0; + y[idx] = 0; + } else if (x_idx <= static_cast(-1.0)) { + x[idx] = -1.0; + y[idx] = 0; + } else { + x[idx] = x_idx; + } +} + +__global__ void update_tail(const int* __restrict__ tmp, int8_t* x, int* y, int size, float xi, float beta, float delta, + float dt) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) + return; + y[idx] += static_cast(xi * static_cast(tmp[idx] / 127) + beta * static_cast(x[idx])); + int x_idx = static_cast(x[idx]) + y[idx] * delta * dt; + if (x_idx >= UP) { + x[idx] = UP; + y[idx] = 0; + } else if (x_idx <= LOW) { + x[idx] = LOW; + y[idx] = 0; + } else { + x[idx] = x_idx; + } +} + +__global__ void update_h_tail(const int* __restrict__ tmp, int8_t* x, int* y, int* h, int size, float xi, float beta, + float delta, float dt) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) + return; + y[idx] += static_cast(xi * static_cast((tmp[idx] + h[idx]) / 127) + beta * static_cast(x[idx])); + int 
x_idx = static_cast(x[idx]) + y[idx] * delta * dt; + if (x_idx >= UP) { + x[idx] = UP; + y[idx] = 0; + } else if (x_idx <= LOW) { + x[idx] = LOW; + y[idx] = 0; + } else { + x[idx] = x_idx; + } +} + +// Base template (left undefined) +template +__global__ void init_xy(T* array, int size, uint64_t seed); +// Specialization for int +template <> +__global__ void init_xy(int* array, int size, uint64_t seed) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + curandState state; + curand_init(seed, idx, 0, &state); + int randomValue = curand(&state) % 7 - 3; + array[idx] = randomValue; + } +} +// Specialization for int8_t +template <> +__global__ void init_xy(int8_t* array, int size, uint64_t seed) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + curandState state; + curand_init(seed, idx, 0, &state); + int randomValue = curand(&state) % 7 - 3; + array[idx] = static_cast(randomValue); + } +} +// Specialization for half +template <> +__global__ void init_xy(half* array, int size, uint64_t seed) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + curandState state; + curand_init(seed, idx, 0, &state); + float rand_float = curand_uniform(&state) * 0.02f - 0.01f; + array[idx] = __float2half(rand_float); + } +} + +template +struct Value; + +template <> +struct Value { + static int8_t v(double value) { + return static_cast(value * 127.0); + } +}; + +template <> +struct Value { + static half v(double value) { + return static_cast(value); + } +}; + +template +void fill_J(Index* indices, Index* indptr, double* data, std::vector* h_J, int N) { + for (int i = 0; i < N; ++i) { + Index start = indptr[i]; + Index end = indptr[i + 1]; + for (Index j = start; j < end; ++j) { + Index col = indices[j]; + double value = data[j]; + (*h_J)[i * N + col] = Value::v(value); + } + } +} + +#endif diff --git a/ccsrc/lib/CMakeLists.txt b/ccsrc/lib/CMakeLists.txt index e4d688cbe239bec2708903b07ea1f8ac55fe90a6..bbbdf89bd4f356567377fdf3406bd0a8b0f67c3e 100644 --- a/ccsrc/lib/CMakeLists.txt +++ b/ccsrc/lib/CMakeLists.txt @@ -16,6 +16,7 @@ # # ============================================================================== +add_subdirectory(algorithm) add_subdirectory(mq_base) add_subdirectory(simulator) add_subdirectory(device) diff --git a/ccsrc/lib/algorithm/CMakeLists.txt b/ccsrc/lib/algorithm/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3353242ced1eabdcfb7380b5e5b429ac72f27632 --- /dev/null +++ b/ccsrc/lib/algorithm/CMakeLists.txt @@ -0,0 +1,26 @@ +# ============================================================================== +# +# Copyright 2022 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# ============================================================================== + +if(ENABLE_CUDA) + add_library(qaia_sb_gpu STATIC) + target_link_libraries(qaia_sb_gpu PUBLIC CUDA_mindquantum mq_math cublas) + set_target_properties(qaia_sb_gpu PROPERTIES CUDA_RUNTIME_LIBRARY $,Static,Shared>) + force_at_least_cxx17_workaround(qaia_sb_gpu) + append_to_property(mq_install_targets GLOBAL qaia_sb_gpu) + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/qaia/detail) +endif() diff --git a/ccsrc/lib/algorithm/qaia/detail/CMakeLists.txt b/ccsrc/lib/algorithm/qaia/detail/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..612e6cc98a27c3f533a7b988f5d1e158212621b1 --- /dev/null +++ b/ccsrc/lib/algorithm/qaia/detail/CMakeLists.txt @@ -0,0 +1,21 @@ +# ============================================================================== +# +# Copyright 2020 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== + +# lint_cmake: -whitespace/indent + +target_sources(qaia_sb_gpu PRIVATE ${CMAKE_CURRENT_LIST_DIR}/gpu_sb_update.cu) diff --git a/ccsrc/lib/algorithm/qaia/detail/gpu_sb_update.cu b/ccsrc/lib/algorithm/qaia/detail/gpu_sb_update.cu new file mode 100644 index 0000000000000000000000000000000000000000..18f986ba662582693977024a0b41c1277bbdb8c0 --- /dev/null +++ b/ccsrc/lib/algorithm/qaia/detail/gpu_sb_update.cu @@ -0,0 +1,552 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
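The two listings above wire the CUDA sources into a static helper library gated on `ENABLE_CUDA`. A hedged sketch of how another target in the same build tree could consume it; the target name `sb_demo` and its source path are hypothetical:

```cmake
# Hypothetical consumer target; only meaningful when ENABLE_CUDA is set,
# since qaia_sb_gpu is only defined inside the if(ENABLE_CUDA) block above.
if(ENABLE_CUDA)
  add_executable(sb_demo demo/sb_demo.cu)           # hypothetical source file
  target_link_libraries(sb_demo PRIVATE qaia_sb_gpu)
endif()
```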
+ */ + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "algorithm/qaia/csr_base.h" +#include "algorithm/qaia/detail/common.cuh" +#include "algorithm/qaia/detail/gpu_sb.cuh" +#include "algorithm/qaia/detail/para.h" +#include "algorithm/qaia/detail/tools.cuh" + +using mindquantum::Index; + +namespace mindquantum::algorithm::qaia::detail { + +void SBBase::dSB_update_int8(mindquantum::sparse::CsrBase csr, double* x, Para paras) { + Index* indptr = csr.indptr_; + Index* indices = csr.indices_; + double* data = csr.data_; + int N = csr.dim_; + int B = paras.B; + float xi = paras.xi; + float delta = paras.delta; + float dt = paras.dt; + int n_iter = paras.n_iter; + int NN = N * N; + int NB = N * B; + + std::vector h_J(NN, 0); + std::vector h_x(NB); + std::vector h_y(NB); + + fill_J(indices, indptr, data, &h_J, N); + + int8_t *d_J, *d_x, *signx; + int *d_y, *tmp; + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_J), NN * sizeof(int8_t))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_x), NB * sizeof(int8_t))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_y), NB * sizeof(int))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&tmp), NB * sizeof(int))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&signx), NB * sizeof(int8_t))); + HANDLE_ERROR(cudaMemcpy(d_J, h_J.data(), NN * sizeof(int8_t), cudaMemcpyHostToDevice)); + + init_xy<<<(NB + 255) / 256, 256>>>(d_x, NB, time(NULL)); + init_xy<<<(NB + 255) / 256, 256>>>(d_y, NB, time(NULL)); + + cublasHandle_t handle; + cublasCreate(&handle); + int alpha = 1; + int b = 0; + for (int i = 0; i < n_iter; i++) { + float beta = (n_iter == 1) ? -delta : static_cast(i) / (n_iter - 1) - delta; + + sign_kernel<<<(NB + 255) / 256, 256>>>(d_x, signx, NB); + + CHECK_CUBLAS(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, B, N, &alpha, d_J, CUDA_R_8I, N, signx, + CUDA_R_8I, N, &b, tmp, CUDA_R_32I, N, CUDA_R_32I, CUBLAS_GEMM_ALGO15_TENSOR_OP)); + + update_tail<<<(NB + 255) / 256, 256>>>(tmp, d_x, d_y, NB, xi, beta, delta, dt); + } + CHECK_CUDA(cudaMemcpy(h_x.data(), d_x, NB * sizeof(int8_t), cudaMemcpyDeviceToHost)) + + for (int i = 0; i < N; i++) { + for (int j = 0; j < B; j++) { + x[i * B + j] = static_cast(h_x[j * N + i]) / 127.0; + } + } + + HANDLE_ERROR(cudaFree(d_J)); + HANDLE_ERROR(cudaFree(d_x)); + HANDLE_ERROR(cudaFree(d_y)); + HANDLE_ERROR(cudaFree(tmp)); + HANDLE_ERROR(cudaFree(signx)); +} + +void SBBase::bSB_update_int8(mindquantum::sparse::CsrBase csr, double* x, Para paras) { + Index* indptr = csr.indptr_; + Index* indices = csr.indices_; + double* data = csr.data_; + int N = csr.dim_; + int B = paras.B; + float xi = paras.xi; + float delta = paras.delta; + float dt = paras.dt; + int n_iter = paras.n_iter; + int NN = N * N; + int NB = N * B; + + std::vector h_J(NN, 0); + std::vector h_x(NB); + std::vector h_y(NB); + + fill_J(indices, indptr, data, &h_J, N); + + int8_t *d_J, *d_x; + int *d_y, *tmp; + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_J), NN * sizeof(int8_t))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_x), NB * sizeof(int8_t))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_y), NB * sizeof(int))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&tmp), NB * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_J, h_J.data(), NN * sizeof(int8_t), cudaMemcpyHostToDevice)); + + init_xy<<<(NB + 255) / 256, 256>>>(d_x, NB, time(NULL)); + init_xy<<<(NB + 255) / 256, 256>>>(d_y, NB, time(NULL)); + + cublasHandle_t handle; + cublasCreate(&handle); + int alpha = 1; + int b = 0; + for (int i = 0; i < n_iter; i++) { + float beta = 
(n_iter == 1) ? -delta : static_cast(i) / (n_iter - 1) - delta; + + CHECK_CUBLAS(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, B, N, &alpha, d_J, CUDA_R_8I, N, d_x, CUDA_R_8I, + N, &b, tmp, CUDA_R_32I, N, CUDA_R_32I, CUBLAS_GEMM_ALGO15_TENSOR_OP)); + + update_tail<<<(NB + 255) / 256, 256>>>(tmp, d_x, d_y, NB, xi, beta, delta, dt); + } + CHECK_CUDA(cudaMemcpy(h_x.data(), d_x, NB * sizeof(int8_t), cudaMemcpyDeviceToHost)) + + for (int i = 0; i < N; i++) { + for (int j = 0; j < B; j++) { + x[i * B + j] = static_cast(h_x[j * N + i]) / 127.0; + } + } + HANDLE_ERROR(cudaFree(d_J)); + HANDLE_ERROR(cudaFree(d_x)); + HANDLE_ERROR(cudaFree(d_y)); + HANDLE_ERROR(cudaFree(tmp)); +} + +void SBBase::bSB_update_fp16(mindquantum::sparse::CsrBase csr, double* x, Para paras) { + Index* indptr = csr.indptr_; + Index* indices = csr.indices_; + double* data = csr.data_; + int N = csr.dim_; + int B = paras.B; + float xi = paras.xi; + float delta = paras.delta; + float dt = paras.dt; + int n_iter = paras.n_iter; + int NN = N * N; + int NB = N * B; + + std::vector h_J(NN, 0); + std::vector h_x(NB); + std::vector h_y(NB); + + fill_J(indices, indptr, data, &h_J, N); + + half *d_J, *d_x; + half *d_y, *tmp; + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_J), NN * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_x), NB * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_y), NB * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&tmp), NB * sizeof(half))); + HANDLE_ERROR(cudaMemcpy(d_J, h_J.data(), NN * sizeof(half), cudaMemcpyHostToDevice)); + + init_xy<<<(NB + 255) / 256, 256>>>(d_x, NB, time(NULL)); + init_xy<<<(NB + 255) / 256, 256>>>(d_y, NB, time(NULL)); + + cublasHandle_t handle; + cublasCreate(&handle); + // cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH); + half b = 0.0; + half alpha = 1.0; + for (int i = 0; i < n_iter; i++) { + float beta = (n_iter == 1) ? 
-delta : static_cast(i) / (n_iter - 1) - delta; + + CHECK_CUBLAS(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, B, N, N, &alpha, d_x, CUDA_R_16F, B, d_J, + CUDA_R_16F, N, &b, tmp, CUDA_R_16F, B, CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_ALGO15_TENSOR_OP)); + + update_tail_half<<<(NB + 255) / 256, 256>>>(tmp, d_x, d_y, NB, xi, beta, delta, dt); + } + CHECK_CUDA(cudaMemcpy(h_x.data(), d_x, NB * sizeof(half), cudaMemcpyDeviceToHost)) + + for (int i = 0; i < N; i++) { + for (int j = 0; j < B; j++) { + x[i * B + j] = static_cast(h_x[i * B + j]); + } + } + + HANDLE_ERROR(cudaFree(d_J)); + HANDLE_ERROR(cudaFree(d_x)); + HANDLE_ERROR(cudaFree(d_y)); + HANDLE_ERROR(cudaFree(tmp)); +} + +void SBBase::dSB_update_fp16(mindquantum::sparse::CsrBase csr, double* x, Para paras) { + Index* indptr = csr.indptr_; + Index* indices = csr.indices_; + double* data = csr.data_; + int N = csr.dim_; + int B = paras.B; + float xi = paras.xi; + float delta = paras.delta; + float dt = paras.dt; + int n_iter = paras.n_iter; + int NN = N * N; + int NB = N * B; + + std::vector h_J(NN, 0); + std::vector h_x(NB); + std::vector h_y(NB); + + fill_J(indices, indptr, data, &h_J, N); + + half *d_J, *d_x; + half *d_y, *tmp, *signx; + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_J), NN * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_x), NB * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_y), NB * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&tmp), NB * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&signx), NB * sizeof(half))); + HANDLE_ERROR(cudaMemcpy(d_J, h_J.data(), NN * sizeof(half), cudaMemcpyHostToDevice)); + + init_xy<<<(NB + 255) / 256, 256>>>(d_x, NB, time(NULL)); + init_xy<<<(NB + 255) / 256, 256>>>(d_y, NB, time(NULL)); + + cublasHandle_t handle; + cublasCreate(&handle); + half b = 0.0; + half alpha = 1.0; + for (int i = 0; i < n_iter; i++) { + float beta = (n_iter == 1) ? 
-delta : static_cast(i) / (n_iter - 1) - delta; + + sign_kernel<<<(NB + 255) / 256, 256>>>(d_x, signx, NB); + + CHECK_CUBLAS(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, B, N, N, &alpha, signx, CUDA_R_16F, B, d_J, + CUDA_R_16F, N, &b, tmp, CUDA_R_16F, B, CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_ALGO15_TENSOR_OP)); + + update_tail_half<<<(NB + 255) / 256, 256>>>(tmp, d_x, d_y, NB, xi, beta, delta, dt); + } + CHECK_CUDA(cudaMemcpy(h_x.data(), d_x, NB * sizeof(half), cudaMemcpyDeviceToHost)) + + for (int i = 0; i < N; i++) { + for (int j = 0; j < B; j++) { + x[i * B + j] = static_cast(h_x[i * B + j]); + } + } + HANDLE_ERROR(cudaFree(d_J)); + HANDLE_ERROR(cudaFree(d_x)); + HANDLE_ERROR(cudaFree(d_y)); + HANDLE_ERROR(cudaFree(tmp)); + HANDLE_ERROR(cudaFree(signx)); +} + +void SBBase::dSB_update_h_int8(mindquantum::sparse::CsrBase csr, double* x, Para paras, double* h, int ndim) { + Index* indptr = csr.indptr_; + Index* indices = csr.indices_; + double* data = csr.data_; + int N = csr.dim_; + int B = paras.B; + float xi = paras.xi; + float delta = paras.delta; + float dt = paras.dt; + int n_iter = paras.n_iter; + int NN = N * N; + int NB = N * B; + + std::vector h_J(NN, 0); + std::vector h_x(NB); + std::vector h_y(NB); + + fill_J(indices, indptr, data, &h_J, N); + + int8_t *d_J, *d_x, *signx; + int *d_y, *tmp, *d_h; + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_J), NN * sizeof(int8_t))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_x), NB * sizeof(int8_t))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_y), NB * sizeof(int))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&tmp), NB * sizeof(int))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&signx), NB * sizeof(int8_t))); + HANDLE_ERROR(cudaMemcpy(d_J, h_J.data(), NN * sizeof(int8_t), cudaMemcpyHostToDevice)); + std::vector h_h(NB, 0); + for (int i = 0; i < NB; i++) { + h_h[i] = static_cast(h[i] * 127); + } + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_h), NB * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_h, h_h.data(), NB * sizeof(int), cudaMemcpyHostToDevice)); + + init_xy<<<(NB + 255) / 256, 256>>>(d_x, NB, time(NULL)); + init_xy<<<(NB + 255) / 256, 256>>>(d_y, NB, time(NULL)); + + cublasHandle_t handle; + cublasCreate(&handle); + int alpha = 1; + int b = 0; + for (int i = 0; i < n_iter; i++) { + float beta = (n_iter == 1) ? 
-delta : static_cast(i) / (n_iter - 1) - delta; + + sign_kernel<<<(NB + 255) / 256, 256>>>(d_x, signx, NB); + + CHECK_CUBLAS(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, B, N, &alpha, d_J, CUDA_R_8I, N, signx, + CUDA_R_8I, N, &b, tmp, CUDA_R_32I, N, CUDA_R_32I, CUBLAS_GEMM_ALGO15_TENSOR_OP)); + + update_h_tail<<<(NB + 255) / 256, 256>>>(tmp, d_x, d_y, d_h, NB, xi, beta, delta, dt); + } + CHECK_CUDA(cudaMemcpy(h_x.data(), d_x, NB * sizeof(int8_t), cudaMemcpyDeviceToHost)) + + for (int i = 0; i < N; i++) { + for (int j = 0; j < B; j++) { + x[i * B + j] = static_cast(h_x[j * N + i]) / 127.0; + } + } + HANDLE_ERROR(cudaFree(d_h)); + HANDLE_ERROR(cudaFree(d_J)); + HANDLE_ERROR(cudaFree(d_x)); + HANDLE_ERROR(cudaFree(d_y)); + HANDLE_ERROR(cudaFree(tmp)); + HANDLE_ERROR(cudaFree(signx)); +} + +void SBBase::bSB_update_h_int8(mindquantum::sparse::CsrBase csr, double* x, Para paras, double* h, int ndim) { + Index* indptr = csr.indptr_; + Index* indices = csr.indices_; + double* data = csr.data_; + int N = csr.dim_; + int B = paras.B; + float xi = paras.xi; + float delta = paras.delta; + float dt = paras.dt; + int n_iter = paras.n_iter; + int NN = N * N; + int NB = N * B; + + std::vector h_J(NN, 0); + std::vector h_x(NB); + std::vector h_y(NB); + + fill_J(indices, indptr, data, &h_J, N); + + int8_t *d_J, *d_x; + int *d_y, *tmp, *d_h; + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_J), NN * sizeof(int8_t))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_x), NB * sizeof(int8_t))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_y), NB * sizeof(int))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&tmp), NB * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_J, h_J.data(), NN * sizeof(int8_t), cudaMemcpyHostToDevice)); + std::vector h_h(NB, 0); + for (int i = 0; i < NB; i++) { + h_h[i] = static_cast(data[i] * 127); + } + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_h), NB * sizeof(int))); + HANDLE_ERROR(cudaMemcpy(d_h, h_h.data(), NB * sizeof(int), cudaMemcpyHostToDevice)); + + init_xy<<<(NB + 255) / 256, 256>>>(d_x, NB, time(NULL)); + init_xy<<<(NB + 255) / 256, 256>>>(d_y, NB, time(NULL)); + + cublasHandle_t handle; + cublasCreate(&handle); + int alpha = 1; + int b = 0; + for (int i = 0; i < n_iter; i++) { + float beta = (n_iter == 1) ? 
-delta : static_cast(i) / (n_iter - 1) - delta; + + CHECK_CUBLAS(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, B, N, &alpha, d_J, CUDA_R_8I, N, d_x, CUDA_R_8I, + N, &b, tmp, CUDA_R_32I, N, CUDA_R_32I, CUBLAS_GEMM_ALGO15_TENSOR_OP)); + + update_h_tail<<<(NB + 255) / 256, 256>>>(tmp, d_x, d_y, d_h, NB, xi, beta, delta, dt); + } + CHECK_CUDA(cudaMemcpy(h_x.data(), d_x, NB * sizeof(int8_t), cudaMemcpyDeviceToHost)) + + for (int i = 0; i < N; i++) { + for (int j = 0; j < B; j++) { + x[i * B + j] = static_cast(h_x[j * N + i]) / 127.0; + } + } + + HANDLE_ERROR(cudaFree(d_h)); + HANDLE_ERROR(cudaFree(d_J)); + HANDLE_ERROR(cudaFree(d_x)); + HANDLE_ERROR(cudaFree(d_y)); + HANDLE_ERROR(cudaFree(tmp)); +} + +void SBBase::bSB_update_h_fp16(mindquantum::sparse::CsrBase csr, double* x, Para paras, double* h, int ndim) { + Index* indptr = csr.indptr_; + Index* indices = csr.indices_; + double* data = csr.data_; + int N = csr.dim_; + int B = paras.B; + float xi = paras.xi; + float delta = paras.delta; + float dt = paras.dt; + int n_iter = paras.n_iter; + int NN = N * N; + int NB = N * B; + + std::vector h_J(NN, 0); + std::vector h_x(NB); + std::vector h_y(NB); + + fill_J(indices, indptr, data, &h_J, N); + + half *d_J, *d_x; + half *d_y, *tmp, *d_h; + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_J), NN * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_x), NB * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_y), NB * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&tmp), NB * sizeof(half))); + HANDLE_ERROR(cudaMemcpy(d_J, h_J.data(), NN * sizeof(half), cudaMemcpyHostToDevice)); + std::vector h_h(NB, 0); + for (int i = 0; i < NB; i++) { + h_h[i] = static_cast(h[i]); + } + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_h), NB * sizeof(half))); + HANDLE_ERROR(cudaMemcpy(d_h, h_h.data(), NB * sizeof(half), cudaMemcpyHostToDevice)); + + init_xy<<<(NB + 255) / 256, 256>>>(d_x, NB, time(NULL)); + init_xy<<<(NB + 255) / 256, 256>>>(d_y, NB, time(NULL)); + + cublasHandle_t handle; + cublasCreate(&handle); + // cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH); + half b = 0.0; + half alpha = 1.0; + for (int i = 0; i < n_iter; i++) { + float beta = (n_iter == 1) ? 
-delta : static_cast(i) / (n_iter - 1) - delta; + + CHECK_CUBLAS(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, B, N, N, &alpha, d_x, CUDA_R_16F, B, d_J, + CUDA_R_16F, N, &b, tmp, CUDA_R_16F, B, CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_ALGO15_TENSOR_OP)); + + update_h_tail_half<<<(NB + 255) / 256, 256>>>(tmp, d_x, d_y, d_h, NB, xi, beta, delta, dt); + } + CHECK_CUDA(cudaMemcpy(h_x.data(), d_x, NB * sizeof(half), cudaMemcpyDeviceToHost)) + + for (int i = 0; i < N; i++) { + for (int j = 0; j < B; j++) { + x[i * B + j] = static_cast(h_x[i * B + j]); + } + } + + HANDLE_ERROR(cudaFree(d_h)); + HANDLE_ERROR(cudaFree(d_J)); + HANDLE_ERROR(cudaFree(d_x)); + HANDLE_ERROR(cudaFree(d_y)); + HANDLE_ERROR(cudaFree(tmp)); +} + +void SBBase::dSB_update_h_fp16(mindquantum::sparse::CsrBase csr, double* x, Para paras, double* h, int ndim) { + Index* indptr = csr.indptr_; + Index* indices = csr.indices_; + double* data = csr.data_; + int N = csr.dim_; + int B = paras.B; + float xi = paras.xi; + float delta = paras.delta; + float dt = paras.dt; + int n_iter = paras.n_iter; + int NN = N * N; + int NB = N * B; + + std::vector h_J(NN, 0); + std::vector h_x(NB); + std::vector h_y(NB); + + fill_J(indices, indptr, data, &h_J, N); + + half *d_J, *d_x; + half *d_y, *tmp, *d_h, *signx; + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_J), NN * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_x), NB * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_y), NB * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&tmp), NB * sizeof(half))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&signx), NB * sizeof(half))); + HANDLE_ERROR(cudaMemcpy(d_J, h_J.data(), NN * sizeof(half), cudaMemcpyHostToDevice)); + std::vector h_h(NB, 0); + for (int i = 0; i < NB; i++) { + h_h[i] = static_cast(h[i]); + } + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_h), NB * sizeof(half))); + HANDLE_ERROR(cudaMemcpy(d_h, h_h.data(), NB * sizeof(half), cudaMemcpyHostToDevice)); + + init_xy<<<(NB + 255) / 256, 256>>>(d_x, NB, time(NULL)); + init_xy<<<(NB + 255) / 256, 256>>>(d_y, NB, time(NULL)); + + cublasHandle_t handle; + cublasCreate(&handle); + half b = 0.0; + half alpha = 1.0; + for (int i = 0; i < n_iter; i++) { + float beta = (n_iter == 1) ? 
-delta : static_cast(i) / (n_iter - 1) - delta; + + sign_kernel<<<(NB + 255) / 256, 256>>>(d_x, signx, NB); + + CHECK_CUBLAS(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, B, N, N, &alpha, signx, CUDA_R_16F, B, d_J, + CUDA_R_16F, N, &b, tmp, CUDA_R_16F, B, CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_ALGO15_TENSOR_OP)); + + update_h_tail_half<<<(NB + 255) / 256, 256>>>(tmp, d_x, d_y, d_h, NB, xi, beta, delta, dt); + } + CHECK_CUDA(cudaMemcpy(h_x.data(), d_x, NB * sizeof(half), cudaMemcpyDeviceToHost)) + + for (int i = 0; i < N; i++) { + for (int j = 0; j < B; j++) { + x[i * B + j] = static_cast(h_x[i * B + j]); + } + } + + HANDLE_ERROR(cudaFree(d_h)); + HANDLE_ERROR(cudaFree(d_J)); + HANDLE_ERROR(cudaFree(d_x)); + HANDLE_ERROR(cudaFree(d_y)); + HANDLE_ERROR(cudaFree(tmp)); + HANDLE_ERROR(cudaFree(signx)); +} + +void SBBase::cublas_warmup(int N, int B) { + int8_t *d_J, *d_x; + int* tmp; + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_J), N * N * sizeof(int8_t))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&d_x), N * B * sizeof(int8_t))); + HANDLE_ERROR(cudaMalloc(reinterpret_cast(&tmp), N * B * sizeof(int))); + cublasHandle_t handle; + cublasCreate(&handle); + int alpha = 1; + int beta = 0; + for (int i = 0; i < 100; i++) { + CHECK_CUBLAS(cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, B, N, &alpha, d_J, CUDA_R_8I, N, d_x, CUDA_R_8I, + N, &beta, tmp, CUDA_R_32I, N, CUDA_R_32I, CUBLAS_GEMM_ALGO15_TENSOR_OP)); + } + HANDLE_ERROR(cudaFree(d_J)); + HANDLE_ERROR(cudaFree(d_x)); + HANDLE_ERROR(cudaFree(tmp)); +} + +} // namespace mindquantum::algorithm::qaia::detail diff --git a/ccsrc/python/CMakeLists.txt b/ccsrc/python/CMakeLists.txt index 1cea0d7bf2556d75840cddb57f026b2f5b30de5b..b2fa5403da5d6b53ff5cdebf8b02100ba2d0b910 100644 --- a/ccsrc/python/CMakeLists.txt +++ b/ccsrc/python/CMakeLists.txt @@ -20,6 +20,7 @@ add_subdirectory(core) # ============================================================================== +add_subdirectory(algorithm) add_subdirectory(mqbackend) add_subdirectory(simulator) add_subdirectory(device) diff --git a/ccsrc/python/algorithm/CMakeLists.txt b/ccsrc/python/algorithm/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..66da5c23b0a27b4c4cfa06be39b79ab4f172e687 --- /dev/null +++ b/ccsrc/python/algorithm/CMakeLists.txt @@ -0,0 +1,26 @@ +# ============================================================================== +# +# Copyright 2022 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
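Before the Python bindings below, it is worth summarizing what the kernels above compute. The following is a hedged CPU reference of one ballistic-SB run, matching the update order of the `bSB_update_*` functions (linear beta ramp from `-delta`, momentum update from the cuBLAS product, position update, inelastic walls at ±1 with momentum reset); the function name `bsb_reference` and the default hyperparameters are illustrative assumptions. The discrete variant (`dSB_update_*`) differs only in multiplying `J` by `sign(x)` instead of `x`.

```python
import numpy as np

def bsb_reference(J, B=1, xi=0.1, delta=1.0, dt=0.3, n_iter=1000, rng=None):
    """CPU sketch of one bSB run; columns of x are independent trajectories."""
    rng = np.random.default_rng() if rng is None else rng
    N = J.shape[0]
    x = rng.uniform(-0.01, 0.01, size=(N, B))  # small random start, mirroring init_xy
    y = rng.uniform(-0.01, 0.01, size=(N, B))
    for i in range(n_iter):
        beta = -delta if n_iter == 1 else i / (n_iter - 1) - delta
        y += xi * (J @ x) + beta * x   # J @ x is the cublasGemmEx product (tmp)
        x += y * delta * dt
        hit = np.abs(x) >= 1.0         # walls at +/-1 (+/-127 in the int8 kernels)
        x = np.clip(x, -1.0, 1.0)
        y[hit] = 0.0                   # inelastic wall: momentum reset, as in update_tail
    return np.sign(x)                  # candidate spin configuration per trajectory
```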
+# +# ============================================================================== + +if(ENABLE_CUDA) + pybind11_add_module(_qaia_sb MODULE + ${CMAKE_CURRENT_SOURCE_DIR}/lib/_qaia_sb.cu + OUTPUT_HINT "${MQ_PYTHON_PACKAGE_NAME}") + force_at_least_cxx17_workaround(_qaia_sb) + target_link_libraries(_qaia_sb PUBLIC mq_python_core qaia_sb_gpu) + set_target_properties(_qaia_sb PROPERTIES CUDA_RUNTIME_LIBRARY $,Static,Shared>) +endif() diff --git a/ccsrc/python/algorithm/lib/_qaia_sb.cu b/ccsrc/python/algorithm/lib/_qaia_sb.cu new file mode 100644 index 0000000000000000000000000000000000000000..de5cbc44bfb97e283223beed742d6dfa9c71d692 --- /dev/null +++ b/ccsrc/python/algorithm/lib/_qaia_sb.cu @@ -0,0 +1,80 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "algorithm/qaia/csr_base.h" +#include "algorithm/qaia/detail/gpu_sb.cuh" +#include "algorithm/qaia/detail/para.h" +#include "algorithm/qaia/detail/sb_helper.h" +#include "config/constexpr_type_name.h" +#include "config/format/std_complex.h" +#include "config/type_traits.h" +#include "core/mq_base_types.h" + +namespace py = pybind11; +using namespace pybind11::literals; // NOLINT(build/namespaces_literals) +using mindquantum::Index; + +template +void sb_update(const py::object& csr, const py::array_t& x, const py::array_t& h, int B, float xi, + float delta, float dt, int n_iter) { + auto indices = csr.attr("indices").cast>(); + auto indptr = csr.attr("indptr").cast>(); + auto data = csr.attr("data").cast>(); + int nnz = data.size(); + Index* raw_indices = static_cast(indices.request().ptr); + Index* raw_indptr = static_cast(indptr.request().ptr); + double* raw_data = static_cast(data.request().ptr); + auto shape = csr.attr("shape").cast(); + int nrows = shape[0].cast(); + int ncols = shape[1].cast(); + int N = nrows; + mindquantum::sparse::CsrBase csr_matrix(N, nnz, raw_indptr, raw_indices, raw_data); + + double* raw_x = static_cast(x.request().ptr); + mindquantum::algorithm::qaia::detail::Para paras(B, xi, delta, dt, n_iter); + + double* raw_h = static_cast(h.request().ptr); + int ndim = h.ndim(); + + mindquantum::algorithm::qaia::detail::SBUpdater::update(csr_matrix, raw_x, paras, raw_h, ndim); +} + +PYBIND11_MODULE(_qaia_sb, module) { + module.def("cuda_init", mindquantum::algorithm::qaia::detail::SBBase::cublas_warmup, "warmup cuBLAS"); + module.def("bsb_update_h_int8", &sb_update<0, int8_t, true>, "BSB update func(int8_t) with h"); + module.def("bsb_update_h_half", &sb_update<0, half, true>, "BSB update func(half) with h"); + module.def("dsb_update_h_int8", &sb_update<1, int8_t, true>, "DSB update func(int8_t) with h"); + module.def("dsb_update_h_half", &sb_update<1, half, true>, "DSB update func(half) with h"); + + module.def("bsb_update_int8", &sb_update<0, int8_t, false>, "BSB update func(int8_t) without h"); + 
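These `module.def` entries back the `device`/`precision` options documented for BSB and DSB later in this patch. A hedged sketch of the user-facing path they serve; the toy couplings are illustrative, and note that the int8 path quantizes `J` onto [-127, 127], so couplings should be scaled to roughly unit magnitude:

```python
# Hedged usage sketch of the high-level entry point served by these bindings.
import numpy as np
from scipy.sparse import csr_matrix
from mindquantum.algorithm.qaia import BSB

J = csr_matrix(np.array([[0., -1.], [-1., 0.]]))   # toy Ising couplings
solver = BSB(J, batch_size=64, n_iter=1000, device='gpu', precision='int8')
solver.update()   # one full run; dispatches to bsb_update_int8 defined above
```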
module.def("bsb_update_half", &sb_update<0, half, false>, "BSB update func(half) without h"); + module.def("dsb_update_int8", &sb_update<1, int8_t, false>, "DSB update func(int8_t) without h"); + module.def("dsb_update_half", &sb_update<1, half, false>, "DSB update func(half) without h"); +} diff --git a/docs/api_python/algorithm/compiler/mindquantum.algorithm.compiler.cnry_decompose.rst b/docs/api_python/algorithm/compiler/mindquantum.algorithm.compiler.cnry_decompose.rst new file mode 100644 index 0000000000000000000000000000000000000000..672cb844f33851d81ef56c760481b05f6d3a9313 --- /dev/null +++ b/docs/api_python/algorithm/compiler/mindquantum.algorithm.compiler.cnry_decompose.rst @@ -0,0 +1,12 @@ +mindquantum.algorithm.compiler.cnry_decompose +============================================= + +.. py:function:: mindquantum.algorithm.compiler.cnry_decompose(gate: gates.RY) + + 分解一个受控的 :class:`~.core.gates.RY` 门。 + + 参数: + - **gate** (:class:`~.core.gates.RY`) - 有零或多个控制位的 :class:`~.core.gates.RY` 门。 + + 返回: + List[:class:`~.core.circuit.Circuit`],可能的分解方式。 diff --git a/docs/api_python/algorithm/qaia/mindquantum.algorithm.qaia.BSB.rst b/docs/api_python/algorithm/qaia/mindquantum.algorithm.qaia.BSB.rst index 13396cd74605b60001517b0229a9d742e8a88f41..a5d5b9eff840d7853f6c476fcfdc2f7ed2320b27 100644 --- a/docs/api_python/algorithm/qaia/mindquantum.algorithm.qaia.BSB.rst +++ b/docs/api_python/algorithm/qaia/mindquantum.algorithm.qaia.BSB.rst @@ -1,7 +1,7 @@ mindquantum.algorithm.qaia.BSB =============================== -.. py:class:: mindquantum.algorithm.qaia.BSB(J, h=None, x=None, n_iter=1000, batch_size=1, dt=1, xi=None) +.. py:class:: mindquantum.algorithm.qaia.BSB(J, h=None, x=None, n_iter=1000, batch_size=1, dt=1, xi=None, device='cpu', precision='float32') 弹道模拟分叉算法。 @@ -12,9 +12,11 @@ mindquantum.algorithm.qaia.BSB - **h** (numpy.array) - 外场强度,维度为 :math:`(N, )`。 - **x** (numpy.array) - 自旋初始化配置,维度为 :math:`(N x batch_size)`。默认值: ``None``。 - **n_iter** (int) - 迭代步数。默认值: ``1000``。 - - **batch_size** (int) - 样本个数。默认值为: ``1``。 - - **dt** (float) - 迭代步长。默认值: ``0.1``。 + - **batch_size** (int) - 样本个数。默认值: ``1``。 + - **dt** (float) - 迭代步长。默认值: ``1``。 - **xi** (float) - 频率维数,正的常数。默认值: ``None``。 + - **device** (str) - 计算设备('cpu'或'gpu')。默认值: ``'cpu'``。 + - **precision** (str) - 使用GPU时的精度类型('float32'、'float16'或'int8')。默认值: ``'float32'``。 .. py:method:: update() diff --git a/docs/api_python/algorithm/qaia/mindquantum.algorithm.qaia.DSB.rst b/docs/api_python/algorithm/qaia/mindquantum.algorithm.qaia.DSB.rst index e5463048b601f97d9fae6fdd285ac7472dabfd30..6695fd54de22a3e451a8ef6c5e044c084fb509df 100644 --- a/docs/api_python/algorithm/qaia/mindquantum.algorithm.qaia.DSB.rst +++ b/docs/api_python/algorithm/qaia/mindquantum.algorithm.qaia.DSB.rst @@ -1,7 +1,7 @@ mindquantum.algorithm.qaia.DSB =============================== -.. py:class:: mindquantum.algorithm.qaia.DSB(J, h=None, x=None, n_iter=1000, batch_size=1, dt=1, xi=None) +.. 
py:class:: mindquantum.algorithm.qaia.DSB(J, h=None, x=None, n_iter=1000, batch_size=1, dt=1, xi=None)
+.. py:class:: mindquantum.algorithm.qaia.DSB(J, h=None, x=None, n_iter=1000, batch_size=1, dt=1, xi=None, device='cpu', precision='float32')
 
     离散模拟分叉算法。
 
@@ -12,9 +12,11 @@ mindquantum.algorithm.qaia.DSB
     - **h** (numpy.array) - 外场强度,维度为 :math:`(N, )`。
     - **x** (numpy.array) - 自旋初始化配置,维度为 :math:`(N x batch_size)`。默认值: ``None``。
     - **n_iter** (int) - 迭代步数。默认值: ``1000``。
-    - **batch_size** (int) - 样本个数。默认值为: ``1``。
-    - **dt** (float) - 迭代步长。默认值: ``0.1``。
+    - **batch_size** (int) - 样本个数。默认值: ``1``。
+    - **dt** (float) - 迭代步长。默认值: ``1``。
     - **xi** (float) - 频率维数,正的常数。默认值: ``None``。
+    - **device** (str) - 计算设备('cpu' 或 'gpu')。默认值: ``'cpu'``。
+    - **precision** (str) - 使用 GPU 时的精度类型('float32'、'float16'、'int8')。默认值: ``'float32'``。
 
 .. py:method:: update()
diff --git a/docs/api_python_en/algorithm/mindquantum.algorithm.compiler.rst b/docs/api_python_en/algorithm/mindquantum.algorithm.compiler.rst
index 8098cfe59bfc066aaa8897580bfb3583875d02b4..4db7a8197e69cb22ae4fe17a2b21fc1225109b2b 100644
--- a/docs/api_python_en/algorithm/mindquantum.algorithm.compiler.rst
+++ b/docs/api_python_en/algorithm/mindquantum.algorithm.compiler.rst
@@ -16,6 +16,7 @@ Fixed decompose rules
     mindquantum.algorithm.compiler.rxx_decompose
     mindquantum.algorithm.compiler.crxx_decompose
     mindquantum.algorithm.compiler.cry_decompose
+    mindquantum.algorithm.compiler.cnry_decompose
     mindquantum.algorithm.compiler.ryy_decompose
     mindquantum.algorithm.compiler.cryy_decompose
     mindquantum.algorithm.compiler.rzz_decompose
diff --git a/mindquantum/algorithm/compiler/decompose/fixed_decompose/__init__.py b/mindquantum/algorithm/compiler/decompose/fixed_decompose/__init__.py
index 713d5c93bb601e8c16311639e3c9f0ca4c04b434..6330de08464feb3ea638cfddbb343770037005ea 100644
--- a/mindquantum/algorithm/compiler/decompose/fixed_decompose/__init__.py
+++ b/mindquantum/algorithm/compiler/decompose/fixed_decompose/__init__.py
@@ -33,7 +33,7 @@ from . import (
 from .h_related import ch_decompose
 from .rx_related import crx_decompose
 from .rxx_related import crxx_decompose, rxx_decompose
-from .ry_related import cry_decompose
+from .ry_related import cry_decompose, cnry_decompose
 from .ryy_related import cryy_decompose, ryy_decompose
 from .rz_related import crz_decompose
 from .rzz_related import rzz_decompose
diff --git a/mindquantum/algorithm/compiler/decompose/fixed_decompose/ry_related.py b/mindquantum/algorithm/compiler/decompose/fixed_decompose/ry_related.py
index 316f0a9d59373ec1c4bef60ada679805f6a7ba70..c2c78078d6a1726931fc0d444ffd74430796669f 100644
--- a/mindquantum/algorithm/compiler/decompose/fixed_decompose/ry_related.py
+++ b/mindquantum/algorithm/compiler/decompose/fixed_decompose/ry_related.py
@@ -60,6 +60,74 @@ def cry_decompose(gate: gates.RY):
         circuit += gates.X.on(target, control)
     return [circuit]
 
+def _cnry_frame(ctrl_qubits, obj_qubit, angle):
+    """Iteratively construct the circuit used by the cnry_decompose compiler rule.
-decompose_rules = ['cry_decompose']
+
+    Args:
+        ctrl_qubits (List[int]): The remaining control qubits in the current iteration.
+        obj_qubit (int): The object qubit of the RY gate to be decomposed.
+        angle (:class:`~.core.parameterresolver.ParameterResolver`): The rotation angle in the current iteration.
+
+    Returns:
+        :class:`~.core.circuit.Circuit`, the decomposed circuit for the current iteration.
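+
+    Note:
+        This is the standard recursion for multi-controlled rotations: with
+        controls [q0, rest...] and target t, the circuit unfolds as
+
+            _cnry_frame(rest, t, angle/2), X(t, q0),
+            _cnry_frame(rest, t, -angle/2), X(t, q0),
+
+        so each extra control halves the rotation angles and roughly doubles the
+        gate count, while the single-control base case is the textbook CRY
+        decomposition implemented in the if-branch below.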
+ """ + circ = Circuit() + if len(ctrl_qubits) == 1: + circ.ry(angle/2, obj_qubit) + circ.x(obj_qubit, ctrl_qubits[0]) + circ.ry(-angle/2, obj_qubit) + circ.x(obj_qubit, ctrl_qubits[0]) + else: + circ += _cnry_frame(ctrl_qubits[1:], obj_qubit, angle/2) + circ.x(obj_qubit, ctrl_qubits[0]) + circ += _cnry_frame(ctrl_qubits[1:], obj_qubit, -angle/2) + circ.x(obj_qubit, ctrl_qubits[0]) + return circ + +def cnry_decompose(gate: gates.RY): + """ + Decompose controlled :class:`~.core.gates.RY` gate. + + Args: + gate (:class:`~.core.gates.RY`): A :class:`~.core.gates.RY` gate with zero or more control qubits. + + Returns: + List[:class:`~.core.circuit.Circuit`], all possible decompose solution. + + Examples: + >>> from mindquantum.algorithm.compiler import cnry_decompose + >>> from mindquantum.core.circuit import Circuit + >>> from mindquantum.core.gates import RY + >>> cry = RY(1).on(2, [0, 1]) + >>> origin_circ = Circuit() + cry + >>> decomposed_circ = cnry_decompose(cry)[0] + >>> origin_circ + q0: ──────■─────── + ┃ + ┃ + q1: ──────■─────── + ┃ + ┏━━━┻━━━┓ + q2: ──┨ RY(1) ┠─── + ┗━━━━━━━┛ + >>> decomposed_circ + q0: ─────────────────────────────────────────■──────────────────────────────────────────■──── + ┃ ┃ + ┃ ┃ + q1: ────────────────■──────────────────■─────╂──────────────────■─────────────────■─────╂──── + ┃ ┃ ┃ ┃ ┃ ┃ + ┏━━━━━━━━━┓ ┏━┻━┓ ┏━━━━━━━━━━┓ ┏━┻━┓ ┏━┻━┓ ┏━━━━━━━━━━┓ ┏━┻━┓ ┏━━━━━━━━━┓ ┏━┻━┓ ┏━┻━┓ + q2: ──┨ RY(1/4) ┠─┨╺╋╸┠─┨ RY(-1/4) ┠─┨╺╋╸┠─┨╺╋╸┠─┨ RY(-1/4) ┠─┨╺╋╸┠─┨ RY(1/4) ┠─┨╺╋╸┠─┨╺╋╸┠── + ┗━━━━━━━━━┛ ┗━━━┛ ┗━━━━━━━━━━┛ ┗━━━┛ ┗━━━┛ ┗━━━━━━━━━━┛ ┗━━━┛ ┗━━━━━━━━━┛ ┗━━━┛ ┗━━━┛ + """ + _check_input_type('gate', gates.RY, gate) + + if not gate.ctrl_qubits: + return [Circuit(gate)] + + circuit = _cnry_frame(gate.ctrl_qubits, gate.obj_qubits[0], gate.coeff) + return [circuit] + + +decompose_rules = ['cry_decompose', 'cnry_decompose'] __all__ = decompose_rules diff --git a/mindquantum/algorithm/error_mitigation/VD.py b/mindquantum/algorithm/error_mitigation/VD.py new file mode 100644 index 0000000000000000000000000000000000000000..8dde252480a3e723312400de6b35d5ceee65502f --- /dev/null +++ b/mindquantum/algorithm/error_mitigation/VD.py @@ -0,0 +1,195 @@ +""" +VD.py - Virtual Distillation Algorithm Implementation +===================================================== + +This module implements the virtual distillation algorithm for error mitigation in quantum circuits. + +Example: + >>> from mindquantum.core.gates import AmplitudeDampingChannel, H, CNOT, RX, Z, RY + >>> from mindquantum.core.circuit.channel_adder import BitFlipAdder + >>> from mindquantum.core.circuit import Circuit + >>> from mindquantum.simulator import Simulator + >>> from mindquantum.core.operators import Hamiltonian, QubitOperator + >>> import numpy as np + + >>> circ = Circuit() + >>> circ += RY(1).on(0) + >>> from mindquantum.core.gates import BitFlipChannel + >>> noise_circ = circ + BitFlipChannel(0.005).on(0) + >>> position = [0] + >>> vd_calculator = VDCalculator(noise_circ, position, M=3, nshots=100000, para=None) + >>> result = vd_calculator.calculate() + >>> print(result) + + +Note: + This module assumes that the input observable operator is sigma_z. +""" + +from mindquantum.core.gates import SWAP, Givens +from mindquantum.core.circuit import Circuit, apply +from mindquantum.simulator import Simulator +import numpy as np +from mindquantum.core.gates import RY +from mindquantum.core.gates import BitFlipChannel + +class VDCalculator: + """ + VDCalculator class for virtual distillation. 
+
+    Attributes:
+        noise_circ: The circuit preparing the noisy state.
+        position: The qubit positions that the expectation operator acts on.
+        Mp: The power M of the density matrix used for distillation.
+        nshots: The number of sampling shots.
+        para: The parameters of the parameterized circuit, if any.
+        nqubits: The qubit number of the original circuit.
+        circ: The extended circuit built by prepare_circuit().
+
+    Methods:
+        __init__(noise_circ, position, Mp, nshots, para): Initializes the VDCalculator instance.
+        prepare_circuit(): Prepares the extended circuit for virtual distillation.
+        product_swap(res_l, res_r): Calculates the expectation of the SWAP observable.
+        product_with_z(res_l, res_r, position): Calculates the expectation with Sigma_Z inserted.
+        calculate(): Calculates the virtual distillation result.
+    """
+    def __init__(self, noise_circ, position, Mp=2, nshots=1000000, para=None):
+        self.noise_circ = noise_circ  # The circuit preparing the noisy state
+        self.position = position  # The positions the expectation operator acts on
+        self.Mp = Mp  # The power of the density matrix
+        self.nshots = nshots  # The number of sampling shots
+        self.para = para  # The parameters of the parameterized circuit, if any
+        self.nqubits = noise_circ.n_qubits  # The qubit number of the original circuit
+        self.circ = self.prepare_circuit()  # Extended circuit with Mp copies of the state
+
+    def prepare_circuit(self):
+        '''
+        Construct the circuit that prepares Mp copies of the noisy state (a direct product state).
+        '''
+        circ = Circuit()
+        circ += apply(self.noise_circ, list(range(self.nqubits)))
+        circ = circ.remove_measure()
+
+        if self.para is not None:
+            para_name = circ.params_name
+            pr = dict(zip(para_name, self.para))
+            circ = circ.apply_value(pr)
+
+        temp = Circuit()
+        for i in range(self.Mp):
+            temp += apply(circ, [i * self.nqubits + j for j in range(self.nqubits)])
+        return temp
+
+    def product_swap(self, res_l, res_r):
+        '''
+        Estimate the trace of the density matrix to the Mp-th power from two sampling results.
+        '''
+        pr_l = res_l.data
+        qvec_l = np.zeros(2 ** (self.nqubits * self.Mp))
+        for key in pr_l:
+            qvec_l[int(key, 2)] = np.sqrt(pr_l[key] / self.nshots)
+
+        pr_r = res_r.data
+        qvec_r = np.zeros(2 ** (self.nqubits * self.Mp))
+
+        for key in pr_r:
+            qvec_r[int(key, 2)] = np.sqrt(pr_r[key] / self.nshots)
+
+        # Sign bookkeeping for the Givens/SWAP network between neighbouring copies.
+        for key in pr_r:
+            for j in range(self.Mp - 1):
+                for i in range(self.nqubits):
+                    if (int(key[i + j * self.nqubits], 2) == 1 and int(key[i + (j + 1) * self.nqubits], 2) == 0):
+                        qvec_r[int(key, 2)] = -qvec_r[int(key, 2)]
+
+        return np.inner(qvec_l, qvec_r)
+
+    def product_with_z(self, res_l, res_r, position):
+        '''
+        Estimate the expectation with the observable Pauli-Z inserted at the given positions.
+        '''
+        pr_l = res_l.data
+        qvec_l = np.zeros(2 ** (self.nqubits * self.Mp))
+        for key in pr_l:
+            qvec_l[int(key, 2)] = np.sqrt(pr_l[key] / self.nshots)
+
+        pr_r = res_r.data
+        qvec_r = np.zeros(2 ** (self.nqubits * self.Mp))
+
+        for key in pr_r:
+            qvec_r[int(key, 2)] = np.sqrt(pr_r[key] / self.nshots)
+
+        # Same sign bookkeeping as in product_swap.
+        for key in pr_r:
+            for j in range(self.Mp - 1):
+                for i in range(self.nqubits):
+                    if (int(key[i + j * self.nqubits], 2) == 1 and int(key[i + (j + 1) * self.nqubits], 2) == 0):
+                        qvec_r[int(key, 2)] = -qvec_r[int(key, 2)]
+
+        # Additional sign flips implementing Pauli-Z on the chosen positions.
+        for key in pr_r:
+            for i in range(len(position)):
+                if int(key[position[i]]) == 1:
+                    qvec_r[int(key, 2)] = -qvec_r[int(key, 2)]
+        return np.inner(qvec_l, qvec_r)
+
+    def calculate(self):
+        """
+        Calculate the VD result from the sampling outcomes.
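+
+        The returned value estimates the virtual-distillation expectation
+        (with the observable O = Pauli-Z on `position`):
+
+            <O>_VD = Tr(O * rho^Mp) / Tr(rho^Mp),
+
+        where the numerator is accumulated by product_with_z over the Mp cyclic
+        placements of O and the denominator comes from product_swap.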
+        """
+        circ_swap = apply(self.circ, list(range(self.circ.n_qubits)))
+        circ_copy = apply(self.circ, list(range(self.circ.n_qubits)))
+
+        for i in range(self.Mp - 1):
+            for j in range(self.nqubits):
+                circ_copy += Givens(np.pi / 4).on([j + i * self.nqubits, j + (i + 1) * self.nqubits])
+
+        for i in range(self.Mp - 1):
+            for j in range(self.nqubits):
+                circ_swap += Givens(np.pi / 4).on([j + i * self.nqubits, j + (i + 1) * self.nqubits])
+                circ_swap += SWAP([j + i * self.nqubits, j + (i + 1) * self.nqubits])
+
+        sim = Simulator('mqvector', self.circ.n_qubits)
+        sim.reset()
+        res = sim.sampling(circ_copy.measure_all(), shots=self.nshots, seed=42)
+
+        sim.reset()
+        circ_swap = circ_swap.remove_measure()
+        res_s = sim.sampling(circ_swap.measure_all(), shots=self.nshots, seed=42)
+
+        trpm = self.product_swap(res, res_s)  # estimate of Tr(rho**Mp)
+        frac = 0
+
+        for t in range(self.Mp):
+            circ_o_swap = Circuit()
+            circ_o_swap += apply(circ_swap, list(range(circ_swap.n_qubits)))
+
+            sim.reset()
+            res_r = sim.sampling(circ_o_swap, shots=self.nshots)
+            sim.reset()
+            pos = [self.position[i] + t * self.nqubits for i in range(len(self.position))]
+            frac += self.product_with_z(res_s, res_r, pos)
+
+        tropm = frac / self.Mp  # estimate of Tr(O * rho**Mp)
+        exp = tropm / trpm
+        return exp
+
+
+# Usage example
+# noise_circ, position, Mp, nshots, para should be defined for the actual use case
+# vd_calculator = VDCalculator(noise_circ, position, Mp, nshots, para)
+# result = vd_calculator.calculate()
+# print(result)
+
+if __name__ == '__main__':
+    # Small self-check: a single noisy RY rotation followed by a bit-flip channel.
+    circuit = Circuit()
+    circuit += RY(1).on(0)
+
+    noise_c = circuit + BitFlipChannel(0.005).on(0)
+    posi = [0]
+
+    vd_calculator = VDCalculator(noise_c, posi, Mp=3, nshots=100000, para=None)
+    result = vd_calculator.calculate()
+    print(result)
diff --git a/mindquantum/algorithm/library/amplitude_encoder.py b/mindquantum/algorithm/library/amplitude_encoder.py
index 36958dc6159e91076373ceedf0c16f5ded0db02f..376f0508cdf7d6cdf6490f1edbe82c0f2d9eb356 100644
--- a/mindquantum/algorithm/library/amplitude_encoder.py
+++ b/mindquantum/algorithm/library/amplitude_encoder.py
@@ -21,29 +21,31 @@
 from mindquantum.core.parameterresolver import ParameterResolver
 from mindquantum.utils.type_value_check import _check_input_type
 from mindquantum.utils import normalize
+
 def amp_circuit(n_qubits):
     '''Construct the quantum circuit of the amplitude_encoder.'''
     circ = Circuit()
-    circ += RY(f'alpha_{n_qubits}0').on(n_qubits-1)
-    circ += RZ(f'lambda_{n_qubits}0').on(n_qubits-1)
+    circ += RY(f'alpha_{n_qubits}_0').on(n_qubits - 1)
+    circ += RZ(f'lambda_{n_qubits}_0').on(n_qubits - 1)
     for i in range(1, n_qubits):
         for j in range(int(2**i)):
             string = bin(j)[2:].zfill(i)
             for tem_qubit, bit in enumerate(string):
                 qubit = int(n_qubits - tem_qubit)
                 if bit == '0':
-                    circ += X.on(qubit-1)
+                    circ += X.on(qubit - 1)
 
-            circ += RY(f'alpha_{int(n_qubits-i)}{j}').on(int(n_qubits-1-i), list(range(n_qubits-i, n_qubits)))
-            circ += RZ(f'lambda_{int(n_qubits-i)}{j}').on(int(n_qubits-1-i), list(range(n_qubits-i, n_qubits)))
+            circ += RY(f'alpha_{int(n_qubits-i)}_{j}').on(int(n_qubits - 1 - i), list(range(n_qubits - i, n_qubits)))
+            circ += RZ(f'lambda_{int(n_qubits-i)}_{j}').on(int(n_qubits - 1 - i), list(range(n_qubits - i, n_qubits)))
 
             string = bin(j)[2:].zfill(i)
             for tem_qubit, bit in enumerate(string):
                 qubit = int(n_qubits - tem_qubit)
                 if bit == '0':
-                    circ += X.on(qubit-1)
+                    circ += X.on(qubit - 1)
 
     return circ
 
+
 # pylint: disable=too-many-locals
 def amplitude_encoder(x, n_qubits):
     """
@@ -97,10 +99,10 @@ def amplitude_encoder(x, n_qubits):
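+    # The recursion below builds a binary tree of norms: eta[v][i] is the
+    # l2-norm of the 2**v amplitudes in subtree i, so the controlled-RY angle
+    # alpha_{v}_{i} = 2*arcsin(eta[v-1][2*i+1] / eta[v][i]) splits the norm
+    # between the two child subtrees, while lambda_{v}_{i} (from omega) fixes
+    # the relative phases.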
     for i in range(len(vec)):
         eta[0].append(abs(vec[i]))
 
-    for v in range(1, n_qubits+1):
+    for v in range(1, n_qubits + 1):
         eta.append([])
-        for i in range(int(2**(n_qubits-v))):
-            eta[v].append(np.sqrt(eta[v-1][2*i]**2 + eta[v-1][2*i+1]**2))
+        for i in range(int(2 ** (n_qubits - v))):
+            eta[v].append(np.sqrt(eta[v - 1][2 * i] ** 2 + eta[v - 1][2 * i + 1] ** 2))
 
     omega_0 = []
     for i in range(len(vec)):
@@ -108,26 +110,26 @@ def amplitude_encoder(x, n_qubits):
     omega = [[]]
     for i in range(len(vec)):
-        omega[0].append(2*np.angle(vec[i]))
+        omega[0].append(2 * np.angle(vec[i]))
 
-    for v in range(1, n_qubits+1):
+    for v in range(1, n_qubits + 1):
         omega.append([])
-        for i in range(2**(n_qubits-v)):
-            omega[v].append(0.)
+        for i in range(2 ** (n_qubits - v)):
+            omega[v].append(0.0)
             for j in range(2**v):
-                omega[v][i] += omega_0[i*2**v+j]/2**(v-1)
+                omega[v][i] += omega_0[i * 2**v + j] / 2 ** (v - 1)
 
     alphas = {}
     for v in range(n_qubits, 0, -1):
-        for i in range(2**(n_qubits-v)):
+        for i in range(2 ** (n_qubits - v)):
             if eta[v][i] < 1e-6:
-                alphas[f'alpha_{v}{i}'] = 0
+                alphas[f'alpha_{v}_{i}'] = 0
             else:
-                alphas[f'alpha_{v}{i}'] = 2*np.arcsin(eta[v-1][2*i+1]/eta[v][i])
+                alphas[f'alpha_{v}_{i}'] = 2 * np.arcsin(eta[v - 1][2 * i + 1] / eta[v][i])
 
     lambs = {}
     for v in range(n_qubits, 0, -1):
-        for i in range(2**(n_qubits-v)):
-            lambs[f'lambda_{v}{i}'] = omega[v-1][2*i+1] - omega[v][i]
+        for i in range(2 ** (n_qubits - v)):
+            lambs[f'lambda_{v}_{i}'] = omega[v - 1][2 * i + 1] - omega[v][i]
 
     return amp_circuit(n_qubits), ParameterResolver({**alphas, **lambs})
diff --git a/mindquantum/algorithm/nisq/qnn/qn.py b/mindquantum/algorithm/nisq/qnn/qn.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bcaf0619dcfc68bd4fca286e8ab30125c7320bf
--- /dev/null
+++ b/mindquantum/algorithm/nisq/qnn/qn.py
@@ -0,0 +1,294 @@
+'''
+This module implements the quantum neuron unit from arXiv:1711.11240v1 and the
+quantum Hopfield network built on top of it.
+'''
+
+import numpy as np
+from numpy import pi, cos, sin, sqrt
+import matplotlib.pyplot as plt
+
+from mindquantum.core import Circuit, X, H, RY, Y, RZ, Measure
+from mindquantum.core.circuit import dagger
+from mindquantum.simulator import Simulator  # pylint: disable=unused-import
+
+
+def load_w():
+    '''
+    Load the network weights from the attractor patterns given in the paper.
+    '''
+    mat = np.array([[0, 1, 1], [1, 0, 0], [0, 1, 1]])
+    mat1 = np.array([[1, 0, 1], [0, 1, 0], [0, 1, 0]])
+    data = np.array([mat, mat1])
+    w = w_matrix(data)
+    return w
+
+
+def mat2vec(mat):
+    '''
+    Convert a matrix to a vector.
+
+    Args:
+        mat: the input matrix of shape (m, m).
+
+    Returns:
+        the flattened vector of length m * m, as a Python list.
+    '''
+    return mat.flatten().tolist()
+
+
+def w_matrix(data):
+    '''
+    Build the weight matrix from the (n, m, m) pattern array using the Hebbian rule.
+    '''
+    n_data = len(data)
+    dim = len(data[0]) ** 2
+    w = np.zeros((dim, dim))
+    for i in range(n_data):
+        vec = mat2vec(data[i])
+        w += np.outer(vec, vec)
+    diag_w = np.diag(np.diag(w))
+    w = w - diag_w
+    w /= n_data
+    return w
+
+
+def input_rotation(w, gamma, bias, update_qubit: int, num_update: int):
+    '''
+    Args:
+        w: the weight matrix.
+        gamma: the scaling factor.
+        bias: the bias vector.
+        update_qubit: the qubit that needs to be updated.
+        num_update: the index of the current update (selects the ancilla qubit).
+
+    Returns:
+        the circuit of the input rotation.
+    '''
+    assert len(w) == len(bias)
+    n = len(w)
+    circ = Circuit()
+    for ctrl_qubit in range(n):
+        if w[update_qubit][ctrl_qubit] == 0:
+            continue
+        circ += RY(4 * gamma * w[update_qubit][ctrl_qubit]).on(n + num_update, ctrl_qubit)
+    circ += RY(2 * bias[update_qubit]).on(n + num_update)
+    return circ
+
+
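+# The quantum neuron below follows the repeat-until-success construction of
+# arXiv:1711.11240: controlled RY rotations load theta = b + sum_i w_i * x_i
+# onto an ancilla, and the controlled-Y / RZ(-pi/2) / uncompute sequence
+# implements one RUS step, which on success (ancilla measured 0) applies an
+# effective rotation by the nonlinear activation q(theta) = arctan(tan(theta)**2).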
+def quantum_neuron(w, gamma, bias, update_qubit: int, num_update: int):
+    '''
+    The quantum neuron unit.
+
+    Args:
+        w: the weight matrix.
+        gamma: the scaling factor.
+        bias: the bias vector.
+        update_qubit: the qubit that needs to be updated.
+        num_update: the index of the current update (selects the ancilla qubit).
+    '''
+    n = len(w)  # n data qubits; ancilla n + num_update hosts this update
+    rotationcirc = input_rotation(w, gamma, bias, update_qubit, num_update)
+    rotationcirc_dag = dagger(rotationcirc)
+    circ = Circuit()  # initial circuit instance
+    circ += rotationcirc
+    circ += Y.on(update_qubit, n + num_update)
+    circ += RZ(-pi/2).on(n + num_update)
+    circ += rotationcirc_dag
+    circ += Measure().on(n + num_update)
+    return circ
+
+
+def initial_state(vec):
+    '''
+    Convert a state string of '0', '1' and '+' into a preparation circuit.
+    '''
+    circ = Circuit()
+    for i, x in enumerate(vec):
+        if x == '+':
+            circ += H.on(i)
+        elif x == '0':
+            continue
+        elif x == '1':
+            circ += X.on(i)
+    return circ
+
+
+def q(theta, k):
+    '''
+    The nonlinear activation function after k repeat-until-success steps.
+    '''
+    return np.arctan(np.tan(0.7 * theta) ** (2**k))
+
+
+def ry(theta):
+    '''
+    Matrix form of the RY quantum gate.
+    '''
+    return np.array([[cos(theta/2), -sin(theta/2)], [sin(theta/2), cos(theta/2)]])
+
+
+def ry_activate(theta, psi):
+    '''
+    Apply RY(2 * theta) to the state psi that needs to be activated.
+    '''
+    act_psi = ry(2 * theta) @ psi
+    return act_psi
+
+
+def first_update(vec1, w, gamma, bias, update_qubit=0, num_update=0):
+    '''
+    The first update.
+    '''
+    # The corresponding quantum circuit is constructed for reference; the
+    # update itself is emulated classically below.
+    circ = Circuit()
+    circ += initial_state(vec1)
+    circ += quantum_neuron(w, gamma, bias, update_qubit, num_update)
+
+    psi = vec1[update_qubit]
+    if psi == '+':
+        psi = 1/sqrt(2) * np.array([1, 1])
+    elif psi == '1':
+        psi = np.array([0, 1])
+    elif psi == '0':
+        psi = np.array([1, 0])
+
+    theta0 = 4*gamma*0.5 + 2*bias[update_qubit]
+    theta1 = 4*gamma*1 + 2*bias[update_qubit]
+
+    act_psi = 1/sqrt(2) * ry_activate(q(theta0, 1), psi) + 1/sqrt(2) * ry_activate(q(theta1, 1), psi)
+    if abs(act_psi[0]) >= abs(act_psi[1]):
+        vec1[update_qubit] = '1'
+    else:
+        vec1[update_qubit] = '0'
+    return vec1
+
+
+def second_update(vec2, w, gamma, bias, update_qubit=1, num_update=1):
+    '''
+    The second update.
+    '''
+    circ = Circuit()
+    circ += initial_state(vec2)
+    circ += quantum_neuron(w, gamma, bias, update_qubit, num_update)
+    psi = vec2[update_qubit]
+
+    if psi == '+':
+        psi = 1/sqrt(2) * np.array([1, 1])
+    elif psi == '1':
+        psi = np.array([0, 1])
+    elif psi == '0':
+        psi = np.array([1, 0])
+    theta0 = 4*gamma*1.5 + 2*bias[update_qubit]
+    theta1 = 4*gamma*2 + 2*bias[update_qubit]
+    act_psi = 1/sqrt(2) * ry_activate(q(theta0, 1), psi) + 1/sqrt(2) * ry_activate(q(theta1, 1), psi)
+    if abs(act_psi[0]) <= abs(act_psi[1]):
+        vec2[update_qubit] = '1'
+    else:
+        vec2[update_qubit] = '0'
+    return vec2
+
+
+def third_update(vec3, w, gamma, bias, update_qubit=2, num_update=2):
+    '''
+    The third update.
+    '''
+    circ = Circuit()
+    circ += initial_state(vec3)
+    circ += quantum_neuron(w, gamma, bias, update_qubit, num_update)
+    psi = vec3[update_qubit]
+
+    if psi == '+':
+        psi = 1/sqrt(2) * np.array([1, 1])
+    elif psi == '1':
+        psi = np.array([0, 1])
+    elif psi == '0':
+        psi = np.array([1, 0])
+    theta0 = 4*gamma*2.5 + 2*bias[update_qubit]
+    act_psi = ry_activate(q(theta0, 1), psi)
+    if abs(act_psi[0]) <= abs(act_psi[1]):
+        vec3[update_qubit] = '1'
+    else:
+        vec3[update_qubit] = '0'
+    return vec3
+
+
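+# Note: the hard-coded theta0/theta1 values in the three update functions above
+# appear to enumerate the possible weighted inputs b + sum_i w_i * x_i of this
+# particular 9-pixel network (a '+' pixel contributes both a 0 and a 1 branch),
+# so each classical emulation averages the activated state over the branches.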
+def vec2mat(vec):
+    '''
+    Convert a vector back to a square matrix.
+    '''
+    vec = np.array(vec)
+    dim = len(vec)
+    data = np.reshape(vec, (int(sqrt(dim)), int(sqrt(dim))))
+    return data
+
+
+def str2int(vec):
+    '''
+    Convert the state string to numeric values for plotting.
+    '''
+    new_vec = []
+    for i in vec:
+        if i == '0':
+            new_vec.append(0)
+        elif i == '+':
+            new_vec.append(0.5)
+        elif i == '1':
+            new_vec.append(1)
+    return new_vec
+
+
+def plot(vec):
+    '''
+    Plot a single state as an image.
+    '''
+    new_vec = str2int(vec)
+    mat = vec2mat(new_vec)
+    plt.imshow(mat)
+    plt.axis('off')
+    plt.colorbar()
+    # plt.savefig(f'../fig/result/update_{update}.png')
+    plt.show()
+
+
+def total_plot(total_state):
+    '''
+    Plot the whole sequence of updates.
+    '''
+    total_state1 = []
+    for i in total_state:
+        total_state1.append(str2int(i))
+    fig, axs = plt.subplots(1, 4, figsize=(20, 6))  # 1 row, 4 columns of plots
+    titles = ['initial input', 'after 1 update', 'after 2 updates', 'after 3 updates']
+    cmap = plt.cm.viridis  # pylint: disable=no-member
+    norm = plt.Normalize(0, 1)
+    cbar = axs[0].imshow(vec2mat(total_state1[0]), cmap=cmap, norm=norm)
+    axs[0].set_title(titles[0])
+    for i in range(1, len(total_state1)):
+        updated_matrix = vec2mat(total_state1[i])
+        axs[i].imshow(updated_matrix, cmap=cmap, norm=norm)
+        axs[i].set_title(titles[i])
+
+    fig.colorbar(cbar, ax=axs, orientation='vertical', fraction=0.02, pad=0.04)
+    for ax in axs:
+        ax.axis('off')
+    # plt.tight_layout()
+    plt.savefig('../fig/result/total_update1.png')
+    plt.show()
+
+
+def main():
+    '''
+    Run the full Hopfield-network demo: three neuron updates and the plots.
+    '''
+    initial_state_vec = ['+', '+', '+', '1', '0', '0', '0', '1', '1']
+    w = load_w()
+    bias = np.array([np.pi/4] * 9)
+    gamma = 0.7 / (w.max()*len(w) + bias.max())
+    total_state = []
+    total_state.append(initial_state_vec.copy())
+    for i in range(3):
+        if i == 0:
+            update_state = first_update(initial_state_vec, w, gamma, bias)
+            plot(update_state)
+            total_state.append(update_state.copy())
+        elif i == 1:
+            update_state = second_update(update_state, w, gamma, bias)
+            plot(update_state)
+            total_state.append(update_state.copy())
+        elif i == 2:
+            update_state = third_update(update_state, w, gamma, bias)
+            plot(update_state)
+            total_state.append(update_state.copy())
+    total_plot(total_state)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/mindquantum/algorithm/qaia/SB.py b/mindquantum/algorithm/qaia/SB.py
index a7b0a82ef08276dc50b0a51d9301a03f4c4ed60e..50143160709fd4d3fc7b618ba0bd2b1fb479df19 100644
--- a/mindquantum/algorithm/qaia/SB.py
+++ b/mindquantum/algorithm/qaia/SB.py
@@ -14,6 +14,7 @@
 # ============================================================================
 """Simulated bifurcation (SB) algorithms and its variants."""
 # pylint: disable=invalid-name
+import warnings
 import numpy as np
 from scipy.sparse import csr_matrix
@@ -143,36 +144,70 @@ class BSB(SB):  # noqa: N801
         batch_size (int): The number of sampling. Default: ``1``.
         dt (float): The step size. Default: ``1``.
         xi (float): positive constant with the dimension of frequency. Default: ``None``.
+        device (str): The device to use for computation ('cpu' or 'gpu'). Default: ``'cpu'``.
+        precision (str): Precision type when using GPU ('float32', 'float16', 'int8'). Default: ``'float32'``.
     """
 
     # pylint: disable=too-many-arguments
-    def __init__(
-        self,
-        J,
-        h=None,
-        x=None,
-        n_iter=1000,
-        batch_size=1,
-        dt=1,
-        xi=None,
-    ):
+    def __init__(self, J, h=None, x=None, n_iter=1000, batch_size=1, dt=1, xi=None, device='cpu', precision='float32'):
         """Construct BSB algorithm."""
         super().__init__(J, h, x, n_iter, batch_size, dt, xi)
+
+        self.device = device.lower()
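+        # Note (the same applies to DSB below): if the optional _qaia_sb CUDA
+        # extension is unavailable, the solver silently falls back to the NumPy
+        # implementation in _cpu_update; in that case a non-float32 `precision`
+        # is effectively ignored rather than raising, because the check further
+        # down inspects the originally requested device.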
+        if self.device == 'gpu':
+            try:
+                from mindquantum import _qaia_sb
+            except ImportError:
+                warnings.warn(
+                    "Unable to import bSB GPU backend. This could be due to your environment "
+                    "not satisfying the requirements. Will use CPU backend instead.",
+                    stacklevel=2,
+                )
+                self.device = 'cpu'
+            except RuntimeError as err:
+                warnings.warn(f"Disable bSB GPU backend due to: {err}. Will use CPU backend instead.", stacklevel=2)
+                self.device = 'cpu'
+
+        self.precision = precision.lower()
+        if self.precision not in ['float32', 'float16', 'int8']:
+            raise ValueError("precision must be one of 'float32', 'float16', 'int8'")
+
+        if self.precision != 'float32' and device.lower() == 'cpu':
+            raise NotImplementedError(f"BSB with {precision} precision only supports GPU computation")
+
+        self.initialize()
+
+        if self.device == 'gpu':
+            _qaia_sb.cuda_init(self.J.shape[0], self.batch_size)
+            if self.precision == 'float32':
+                self._update_func = _qaia_sb.bsb_update
+            elif self.precision == 'float16':
+                self._update_func = _qaia_sb.bsb_update_half if self.h is None else _qaia_sb.bsb_update_h_half
+            else:  # int8
+                self._update_func = _qaia_sb.bsb_update_int8 if self.h is None else _qaia_sb.bsb_update_h_int8
+            self._is_gpu = True
+        else:
+            self._update_func = self._cpu_update
+            self._is_gpu = False
+
+    # pylint: disable=unused-argument
+    def _cpu_update(self, J, x, h, batch_size, xi, delta, dt, n_iter):
+        """CPU implementation of update."""
+        # Modified explicit symplectic Euler step of ballistic SB:
+        # y += (-(delta - p_i) * x + xi * J @ x) * dt, then x += delta * y * dt,
+        # followed by inelastic walls at |x| = 1.
+        for i in range(n_iter):
+            if h is None:
+                self.y += (-(delta - self.p[i]) * x + xi * J.dot(x)) * dt
+            else:
+                self.y += (-(delta - self.p[i]) * x + xi * (J.dot(x) + h)) * dt
+            x += dt * self.y * delta
+
+            cond = np.abs(x) > 1
+            # Clamp in place so the caller's self.x sees the walls as well;
+            # rebinding x here would silently drop the clamped values.
+            x[cond] = np.sign(x[cond])
+            self.y = np.where(cond, np.zeros_like(x), self.y)
 
     # pylint: disable=attribute-defined-outside-init
     def update(self):
         """Dynamical evolution based on Modified explicit symplectic Euler method."""
-        for i in range(self.n_iter):
-            if self.h is None:
-                self.y += (-(self.delta - self.p[i]) * self.x + self.xi * self.J.dot(self.x)) * self.dt
-            else:
-                self.y += (-(self.delta - self.p[i]) * self.x + self.xi * (self.J.dot(self.x) + self.h)) * self.dt
-            self.x += self.dt * self.y * self.delta
-
-            cond = np.abs(self.x) > 1
-            self.x = np.where(cond, np.sign(self.x), self.x)
-            self.y = np.where(cond, np.zeros_like(self.x), self.y)
+        self._update_func(self.J, self.x, self.h, self.batch_size, self.xi, self.delta, self.dt, self.n_iter)
 
 
 class DSB(SB):  # noqa: N801
@@ -190,36 +225,66 @@
         batch_size (int): The number of sampling. Default: ``1``.
         dt (float): The step size. Default: ``1``.
         xi (float): positive constant with the dimension of frequency. Default: ``None``.
+        device (str): The device to use for computation ('cpu' or 'gpu'). Default: ``'cpu'``.
+        precision (str): Precision type when using GPU ('float32', 'float16', 'int8'). Default: ``'float32'``.
     """
 
     # pylint: disable=too-many-arguments
-    def __init__(
-        self,
-        J,
-        h=None,
-        x=None,
-        n_iter=1000,
-        batch_size=1,
-        dt=1,
-        xi=None,
-    ):
+    def __init__(self, J, h=None, x=None, n_iter=1000, batch_size=1, dt=1, xi=None, device='cpu', precision='float32'):
        """Construct DSB algorithm."""
         super().__init__(J, h, x, n_iter, batch_size, dt, xi)
+
+        self.device = device.lower()
+        if self.device == 'gpu':
+            try:
+                from mindquantum import _qaia_sb
+            except ImportError:
+                warnings.warn(
+                    "Unable to import dSB GPU backend. This could be due to your environment "
+                    "not satisfying the requirements. Will use CPU backend instead.",
+                    stacklevel=2,
+                )
+                self.device = 'cpu'
+            except RuntimeError as err:
+                warnings.warn(f"Disable dSB GPU backend due to: {err}. Will use CPU backend instead.", stacklevel=2)
+                self.device = 'cpu'
+
+        self.precision = precision.lower()
+        if self.precision not in ['float32', 'float16', 'int8']:
+            raise ValueError("precision must be one of 'float32', 'float16', 'int8'")
+
+        if self.precision != 'float32' and device.lower() == 'cpu':
+            raise NotImplementedError(f"DSB with {precision} precision only supports GPU computation")
+
+        self.initialize()
+
+        if self.device == 'gpu':
+            _qaia_sb.cuda_init(self.J.shape[0], self.batch_size)
+            if self.precision == 'float32':
+                self._update_func = _qaia_sb.dsb_update
+            elif self.precision == 'float16':
+                self._update_func = _qaia_sb.dsb_update_half if self.h is None else _qaia_sb.dsb_update_h_half
+            else:  # int8
+                self._update_func = _qaia_sb.dsb_update_int8 if self.h is None else _qaia_sb.dsb_update_h_int8
+            self._is_gpu = True
+        else:
+            self._update_func = self._cpu_update
+            self._is_gpu = False
+
+    # pylint: disable=unused-argument
+    def _cpu_update(self, J, x, h, batch_size, xi, delta, dt, n_iter):
+        """CPU implementation of update."""
+        for i in range(n_iter):
+            # Discrete SB couples through sign(x) instead of x.
+            if h is None:
+                self.y += (-(delta - self.p[i]) * x + xi * J.dot(np.sign(x))) * dt
+            else:
+                self.y += (-(delta - self.p[i]) * x + xi * (J.dot(np.sign(x)) + h)) * dt
+            x += dt * self.y * delta
+
+            cond = np.abs(x) > 1
+            # Clamp in place so the caller's self.x sees the walls as well.
+            x[cond] = np.sign(x[cond])
+            self.y = np.where(cond, np.zeros_like(x), self.y)
 
     # pylint: disable=attribute-defined-outside-init
     def update(self):
         """Dynamical evolution based on Modified explicit symplectic Euler method."""
-        for i in range(self.n_iter):
-            if self.h is None:
-                self.y += (-(self.delta - self.p[i]) * self.x + self.xi * self.J.dot(np.sign(self.x))) * self.dt
-            else:
-                self.y += (
-                    -(self.delta - self.p[i]) * self.x + self.xi * (self.J.dot(np.sign(self.x)) + self.h)
-                ) * self.dt
-
-            self.x += self.dt * self.y * self.delta
-
-            cond = np.abs(self.x) > 1
-            self.x = np.where(cond, np.sign(self.x), self.x)
-            self.y = np.where(cond, np.zeros_like(self.y), self.y)
+        self._update_func(self.J, self.x, self.h, self.batch_size, self.xi, self.delta, self.dt, self.n_iter)
diff --git a/setup.py b/setup.py
index f142323cf42bff5bf4dca7c7ce7c11b56c445c68..1b881a9e09e807fed6e8ef28b452cdc58c52f6c8 100644
--- a/setup.py
+++ b/setup.py
@@ -504,6 +504,7 @@ ext_modules = [
     CMakeExtension(pymod='mindquantum._mq_vector_gpu', optional=True),
     CMakeExtension(pymod='mindquantum._mq_matrix'),
     CMakeExtension(pymod='mindquantum._math'),
+    CMakeExtension(pymod='mindquantum._qaia_sb', optional=True),
 ]
diff --git a/tests/st/test_algorithm/test_compiler/test_decompose/test_fixed_decompose/test_ry_decompose.py b/tests/st/test_algorithm/test_compiler/test_decompose/test_fixed_decompose/test_ry_decompose.py
index a2532b21497253b16457ebbb5b4ade6f45b88759..74f4000a5a1d5b8cd8b3ebd6d66c4b9d0863a3d9 100644
--- a/tests/st/test_algorithm/test_compiler/test_decompose/test_fixed_decompose/test_ry_decompose.py
+++ b/tests/st/test_algorithm/test_compiler/test_decompose/test_fixed_decompose/test_ry_decompose.py
@@ -16,7 +16,7 @@
 import numpy as np
 import pytest
 
-from mindquantum.algorithm.compiler.decompose import cry_decompose
+from mindquantum.algorithm.compiler.decompose import cry_decompose, cnry_decompose
 from mindquantum.core.circuit import Circuit
 from mindquantum.core.gates import RY
 
@@ -39,3 +39,15 @@ def test_cry():
     cry = RY(1.23).on(1, 0)
     for solution in cry_decompose(cry):
         circuit_equal_test(cry, solution)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+def test_cnry():
+    """
+    Description: Test cnry decompose
+    Expectation: success
+    """
+    cnry = RY(1.23).on(2, [0, 1])
+    for solution in cnry_decompose(cnry):
+        circuit_equal_test(cnry, solution)
diff --git a/tests/st/test_algorithm/test_qaia/test_qaia.py b/tests/st/test_algorithm/test_qaia/test_qaia.py
index 9e0412285b72314728490da73245401270c1e090..f9374f79d8729fe120faae9b6e32c572baa8df78 100644
--- a/tests/st/test_algorithm/test_qaia/test_qaia.py
+++ b/tests/st/test_algorithm/test_qaia/test_qaia.py
@@ -22,7 +22,19 @@
 from scipy.sparse import coo_matrix
 
 from mindquantum.algorithm.qaia import ASB, BSB, CAC, CFC, DSB, LQA, SFC
 from mindquantum.utils.fdopen import fdopen
+import pytest
 
+try:
+    # pylint: disable=unused-import
+    from mindquantum import _qaia_sb
+
+    GPU_AVAILABLE = True
+except (ImportError, RuntimeError):
+    GPU_AVAILABLE = False
+
+
 def read_gset(filename, negate=True):
     """
     Reading Gset and transform it into sparse matrix
@@ -65,6 +77,8 @@
 G = read_gset(str(Path(__file__).parent.parent.parent / 'G1.txt'))
 
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
 def test_aSB():
     """
     Description: Test ASB
@@ -84,6 +98,8 @@
     np.allclose(x, solver.x)
 
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
 def test_bSB():
     """
     Description: Test BSB
@@ -103,6 +119,68 @@
     np.allclose(x, solver.x)
 
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+@pytest.mark.skipif(not GPU_AVAILABLE, reason="GPU backend not available")
+def test_bSB_gpu():
+    """
+    Description: Test BSB GPU implementation
+    Expectation: success
+    """
+    N = G.shape[0]
+    np.random.seed(666)
+    x = 0.01 * (np.random.rand(N, 1) - 0.5)
+    y = 0.01 * (np.random.rand(N, 1) - 0.5)
+
+    # Test float32 precision
+    solver_gpu = BSB(G, n_iter=1, device='gpu', precision='float32')
+    solver_cpu = BSB(G, n_iter=1, device='cpu')
+
+    solver_gpu.x = x.copy()
+    solver_gpu.y = y.copy()
+    solver_cpu.x = x.copy()
+    solver_cpu.y = y.copy()
+
+    solver_gpu.update()
+    solver_cpu.update()
+    assert np.allclose(solver_gpu.x, solver_cpu.x, rtol=1e-5, atol=1e-5)
+
+    # Test with external field
+    h = np.random.rand(N, 1)
+    solver_gpu_h = BSB(G, h=h, n_iter=1, device='gpu', precision='float32')
+    solver_cpu_h = BSB(G, h=h, n_iter=1, device='cpu')
+
+    solver_gpu_h.x = x.copy()
+    solver_gpu_h.y = y.copy()
+    solver_cpu_h.x = x.copy()
+    solver_cpu_h.y = y.copy()
+
+    solver_gpu_h.update()
+    solver_cpu_h.update()
+    assert np.allclose(solver_gpu_h.x, solver_cpu_h.x, rtol=1e-5, atol=1e-5)
+
+    # Test float16 precision
+    solver_fp16 = BSB(G, n_iter=1, device='gpu', precision='float16')
+    solver_fp16.x = x.copy()
+    solver_fp16.y = y.copy()
+    solver_fp16.update()
+    assert np.allclose(
+        solver_fp16.x, solver_cpu.x, rtol=1e-3, atol=1e-3
+    )  # Lower precision for float16, relaxed error bounds
+
+    # Test int8 precision
+    solver_int8 = BSB(G, n_iter=1, device='gpu', precision='int8')
+    solver_int8.x = x.copy()
+    solver_int8.y = y.copy()
+    solver_int8.update()
+    assert np.allclose(
+        solver_int8.x, solver_cpu.x, rtol=1e-2, atol=1e-2
+    )  # Lowest precision for int8, further relaxed error bounds
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
 def test_dSB():
     """
     Description: Test DSB
@@ -122,6 +200,68 @@
     np.allclose(x, solver.x)
 
 
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+@pytest.mark.skipif(not GPU_AVAILABLE, reason="GPU backend not available")
+def test_dSB_gpu():
+    """
+    Description: Test DSB GPU implementation
+    
Expectation: success + """ + N = G.shape[0] + np.random.seed(666) + x = 0.01 * (np.random.rand(N, 1) - 0.5) + y = 0.01 * (np.random.rand(N, 1) - 0.5) + + # Test float32 precision + solver_gpu = DSB(G, n_iter=1, device='gpu', precision='float32') + solver_cpu = DSB(G, n_iter=1, device='cpu') + + solver_gpu.x = x.copy() + solver_gpu.y = y.copy() + solver_cpu.x = x.copy() + solver_cpu.y = y.copy() + + solver_gpu.update() + solver_cpu.update() + assert np.allclose(solver_gpu.x, solver_cpu.x, rtol=1e-5, atol=1e-5) + + # Test with external field + h = np.random.rand(N, 1) + solver_gpu_h = DSB(G, h=h, n_iter=1, device='gpu', precision='float32') + solver_cpu_h = DSB(G, h=h, n_iter=1, device='cpu') + + solver_gpu_h.x = x.copy() + solver_gpu_h.y = y.copy() + solver_cpu_h.x = x.copy() + solver_cpu_h.y = y.copy() + + solver_gpu_h.update() + solver_cpu_h.update() + assert np.allclose(solver_gpu_h.x, solver_cpu_h.x, rtol=1e-5, atol=1e-5) + + # Test float16 precision + solver_fp16 = DSB(G, n_iter=1, device='gpu', precision='float16') + solver_fp16.x = x.copy() + solver_fp16.y = y.copy() + solver_fp16.update() + assert np.allclose( + solver_fp16.x, solver_cpu.x, rtol=1e-3, atol=1e-3 + ) # Lower precision for float16, relaxed error bounds + + # Test int8 precision + solver_int8 = DSB(G, n_iter=1, device='gpu', precision='int8') + solver_int8.x = x.copy() + solver_int8.y = y.copy() + solver_int8.update() + assert np.allclose( + solver_int8.x, solver_cpu.x, rtol=1e-2, atol=1e-2 + ) # Lowest precision for int8, further relaxed error bounds + + +@pytest.mark.level0 +@pytest.mark.platform_x86_cpu def test_LQA(): """ Description: Test LQA @@ -154,6 +294,8 @@ def test_LQA(): np.allclose(x, solver.x) +@pytest.mark.level0 +@pytest.mark.platform_x86_cpu def test_CAC(): """ Description: Test CAC @@ -172,6 +314,8 @@ def test_CAC(): np.allclose(x, solver.x) +@pytest.mark.level0 +@pytest.mark.platform_x86_cpu def test_CFC(): """ Description: Test CFC @@ -191,6 +335,8 @@ def test_CFC(): np.allclose(x, solver.x) +@pytest.mark.level0 +@pytest.mark.platform_x86_cpu def test_SFC(): """ Description: Test SFC