From 86f587039654ac37d01342e66049f2903ccdfadd Mon Sep 17 00:00:00 2001
From: GiggleLiu <cacate0129@gmail.com>
Date: Sun, 25 Feb 2024 00:35:48 +0800
Subject: [PATCH 1/6] add CuYao as extension

---
 Project.toml                        |   8 +
 ext/CuYao.jl                        |   3 +
 ext/CuYao/.gitignore                |  23 ++
 ext/CuYao/LICENSE.md                | 217 +++++++++++++++++++
 ext/CuYao/Project.toml              |  27 +++
 ext/CuYao/README.md                 |  86 ++++++++
 ext/CuYao/benchmarks/TestCuda.jl    |  54 +++++
 ext/CuYao/benchmarks/gates.jl       |  19 ++
 ext/CuYao/benchmarks/gcompile.jl    |  92 ++++++++
 ext/CuYao/benchmarks/paralleldot.jl |  11 +
 ext/CuYao/benchmarks/statexpect.jl  |  13 ++
 ext/CuYao/src/CUDApatch.jl          |  64 ++++++
 ext/CuYao/src/CuYao.jl              |  39 ++++
 ext/CuYao/src/instructs.jl          | 312 ++++++++++++++++++++++++++++
 ext/CuYao/src/register.jl           | 290 ++++++++++++++++++++++++++
 ext/CuYao/test/CUDApatch.jl         |  35 ++++
 ext/CuYao/test/extra.jl             |  52 +++++
 ext/CuYao/test/instructs.jl         | 135 ++++++++++++
 ext/CuYao/test/register.jl          | 139 +++++++++++++
 ext/CuYao/test/runtests.jl          |  18 ++
 lib/YaoPlots/src/vizcircuit.jl      |  14 +-
 21 files changed, 1644 insertions(+), 7 deletions(-)
 create mode 100644 ext/CuYao.jl
 create mode 100644 ext/CuYao/.gitignore
 create mode 100644 ext/CuYao/LICENSE.md
 create mode 100644 ext/CuYao/Project.toml
 create mode 100644 ext/CuYao/README.md
 create mode 100644 ext/CuYao/benchmarks/TestCuda.jl
 create mode 100644 ext/CuYao/benchmarks/gates.jl
 create mode 100644 ext/CuYao/benchmarks/gcompile.jl
 create mode 100644 ext/CuYao/benchmarks/paralleldot.jl
 create mode 100644 ext/CuYao/benchmarks/statexpect.jl
 create mode 100644 ext/CuYao/src/CUDApatch.jl
 create mode 100644 ext/CuYao/src/CuYao.jl
 create mode 100644 ext/CuYao/src/instructs.jl
 create mode 100644 ext/CuYao/src/register.jl
 create mode 100644 ext/CuYao/test/CUDApatch.jl
 create mode 100644 ext/CuYao/test/extra.jl
 create mode 100644 ext/CuYao/test/instructs.jl
 create mode 100644 ext/CuYao/test/register.jl
 create mode 100644 ext/CuYao/test/runtests.jl

diff --git a/Project.toml b/Project.toml
index 88493caf..84787249 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ version = "0.8.13"
 
 [deps]
 BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LuxurySparse = "d05aeea4-b7d4-55ac-b691-9e7fabb07ba2"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
@@ -13,8 +14,15 @@ YaoBlocks = "418bc28f-b43b-5e0b-a6e7-61bbc1a2c1df"
 YaoPlots = "32cfe2d9-419e-45f2-8191-2267705d8dbc"
 YaoSym = "3b27209a-d3d6-11e9-3c0f-41eb92b2cb9d"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+CuYao = "CUDA"
+
 [compat]
 BitBasis = "0.8, 0.9"
+CUDA = "4, 5"
 LinearAlgebra = "1"
 LuxurySparse = "0.7"
 Reexport = "1"
diff --git a/ext/CuYao.jl b/ext/CuYao.jl
new file mode 100644
index 00000000..1421abce
--- /dev/null
+++ b/ext/CuYao.jl
@@ -0,0 +1,3 @@
+module CuYao
+include("CuYao/src/CuYao.jl")
+end
diff --git a/ext/CuYao/.gitignore b/ext/CuYao/.gitignore
new file mode 100644
index 00000000..56b59ee8
--- /dev/null
+++ b/ext/CuYao/.gitignore
@@ -0,0 +1,23 @@
+*.jl.cov
+*.jl.*.cov
+*.jl.mem
+
+docs/build/
+docs/site/
+docs/src/tutorial/RegisterBasics.md
+docs/src/tutorial/BlockBasics.md
+docs/src/tutorial/BinaryBasics.md
+docs/src/tutorial/QCBM.md
+
+*.ipynb_checkpoints
+**/*.ipynb_checkpoints
+**/**/*.ipynb_checkpoints
+
+_*.dat
+*.swp
+__pycache__/
+.vscode/
+
+Manifest.toml
+
+_local/
diff --git a/ext/CuYao/LICENSE.md b/ext/CuYao/LICENSE.md
new file mode 100644
index 00000000..8373be63
--- /dev/null
+++ b/ext/CuYao/LICENSE.md
@@ -0,0 +1,217 @@
+The CuYao.jl package is licensed under the Apache License, Version 2.0:
+
+> COPYRIGHT
+>
+> All contributions by Jinguo Liu
+> Copyright (c) 2018: Jinguo Liu
+> All rights reserved
+>
+> All contributions by Xiuzhe Luo
+> Copyright (c) 2018: Xiuzhe Luo.
+> All rights reserved.
+>
+> Each contributor holds copyright over their respective contributions.
+> The project versioning (Git) records all such contribution source information.
+>
+>                                  Apache License
+>                            Version 2.0, January 2004
+>                         http://www.apache.org/licenses/
+>
+>    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+>
+>    1. Definitions.
+>
+>       "License" shall mean the terms and conditions for use, reproduction,
+>       and distribution as defined by Sections 1 through 9 of this document.
+>
+>       "Licensor" shall mean the copyright owner or entity authorized by
+>       the copyright owner that is granting the License.
+>
+>       "Legal Entity" shall mean the union of the acting entity and all
+>       other entities that control, are controlled by, or are under common
+>       control with that entity. For the purposes of this definition,
+>       "control" means (i) the power, direct or indirect, to cause the
+>       direction or management of such entity, whether by contract or
+>       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+>       outstanding shares, or (iii) beneficial ownership of such entity.
+>
+>       "You" (or "Your") shall mean an individual or Legal Entity
+>       exercising permissions granted by this License.
+>
+>       "Source" form shall mean the preferred form for making modifications,
+>       including but not limited to software source code, documentation
+>       source, and configuration files.
+>
+>       "Object" form shall mean any form resulting from mechanical
+>       transformation or translation of a Source form, including but
+>       not limited to compiled object code, generated documentation,
+>       and conversions to other media types.
+>
+>       "Work" shall mean the work of authorship, whether in Source or
+>       Object form, made available under the License, as indicated by a
+>       copyright notice that is included in or attached to the work
+>       (an example is provided in the Appendix below).
+>
+>       "Derivative Works" shall mean any work, whether in Source or Object
+>       form, that is based on (or derived from) the Work and for which the
+>       editorial revisions, annotations, elaborations, or other modifications
+>       represent, as a whole, an original work of authorship. For the purposes
+>       of this License, Derivative Works shall not include works that remain
+>       separable from, or merely link (or bind by name) to the interfaces of,
+>       the Work and Derivative Works thereof.
+>
+>       "Contribution" shall mean any work of authorship, including
+>       the original version of the Work and any modifications or additions
+>       to that Work or Derivative Works thereof, that is intentionally
+>       submitted to Licensor for inclusion in the Work by the copyright owner
+>       or by an individual or Legal Entity authorized to submit on behalf of
+>       the copyright owner. For the purposes of this definition, "submitted"
+>       means any form of electronic, verbal, or written communication sent
+>       to the Licensor or its representatives, including but not limited to
+>       communication on electronic mailing lists, source code control systems,
+>       and issue tracking systems that are managed by, or on behalf of, the
+>       Licensor for the purpose of discussing and improving the Work, but
+>       excluding communication that is conspicuously marked or otherwise
+>       designated in writing by the copyright owner as "Not a Contribution."
+>
+>       "Contributor" shall mean Licensor and any individual or Legal Entity
+>       on behalf of whom a Contribution has been received by Licensor and
+>       subsequently incorporated within the Work.
+>
+>    2. Grant of Copyright License. Subject to the terms and conditions of
+>       this License, each Contributor hereby grants to You a perpetual,
+>       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+>       copyright license to reproduce, prepare Derivative Works of,
+>       publicly display, publicly perform, sublicense, and distribute the
+>       Work and such Derivative Works in Source or Object form.
+>
+>    3. Grant of Patent License. Subject to the terms and conditions of
+>       this License, each Contributor hereby grants to You a perpetual,
+>       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+>       (except as stated in this section) patent license to make, have made,
+>       use, offer to sell, sell, import, and otherwise transfer the Work,
+>       where such license applies only to those patent claims licensable
+>       by such Contributor that are necessarily infringed by their
+>       Contribution(s) alone or by combination of their Contribution(s)
+>       with the Work to which such Contribution(s) was submitted. If You
+>       institute patent litigation against any entity (including a
+>       cross-claim or counterclaim in a lawsuit) alleging that the Work
+>       or a Contribution incorporated within the Work constitutes direct
+>       or contributory patent infringement, then any patent licenses
+>       granted to You under this License for that Work shall terminate
+>       as of the date such litigation is filed.
+>
+>    4. Redistribution. You may reproduce and distribute copies of the
+>       Work or Derivative Works thereof in any medium, with or without
+>       modifications, and in Source or Object form, provided that You
+>       meet the following conditions:
+>
+>       (a) You must give any other recipients of the Work or
+>           Derivative Works a copy of this License; and
+>
+>       (b) You must cause any modified files to carry prominent notices
+>           stating that You changed the files; and
+>
+>       (c) You must retain, in the Source form of any Derivative Works
+>           that You distribute, all copyright, patent, trademark, and
+>           attribution notices from the Source form of the Work,
+>           excluding those notices that do not pertain to any part of
+>           the Derivative Works; and
+>
+>       (d) If the Work includes a "NOTICE" text file as part of its
+>           distribution, then any Derivative Works that You distribute must
+>           include a readable copy of the attribution notices contained
+>           within such NOTICE file, excluding those notices that do not
+>           pertain to any part of the Derivative Works, in at least one
+>           of the following places: within a NOTICE text file distributed
+>           as part of the Derivative Works; within the Source form or
+>           documentation, if provided along with the Derivative Works; or,
+>           within a display generated by the Derivative Works, if and
+>           wherever such third-party notices normally appear. The contents
+>           of the NOTICE file are for informational purposes only and
+>           do not modify the License. You may add Your own attribution
+>           notices within Derivative Works that You distribute, alongside
+>           or as an addendum to the NOTICE text from the Work, provided
+>           that such additional attribution notices cannot be construed
+>           as modifying the License.
+>
+>       You may add Your own copyright statement to Your modifications and
+>       may provide additional or different license terms and conditions
+>       for use, reproduction, or distribution of Your modifications, or
+>       for any such Derivative Works as a whole, provided Your use,
+>       reproduction, and distribution of the Work otherwise complies with
+>       the conditions stated in this License.
+>
+>    5. Submission of Contributions. Unless You explicitly state otherwise,
+>       any Contribution intentionally submitted for inclusion in the Work
+>       by You to the Licensor shall be under the terms and conditions of
+>       this License, without any additional terms or conditions.
+>       Notwithstanding the above, nothing herein shall supersede or modify
+>       the terms of any separate license agreement you may have executed
+>       with Licensor regarding such Contributions.
+>
+>    6. Trademarks. This License does not grant permission to use the trade
+>       names, trademarks, service marks, or product names of the Licensor,
+>       except as required for reasonable and customary use in describing the
+>       origin of the Work and reproducing the content of the NOTICE file.
+>
+>    7. Disclaimer of Warranty. Unless required by applicable law or
+>       agreed to in writing, Licensor provides the Work (and each
+>       Contributor provides its Contributions) on an "AS IS" BASIS,
+>       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+>       implied, including, without limitation, any warranties or conditions
+>       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+>       PARTICULAR PURPOSE. You are solely responsible for determining the
+>       appropriateness of using or redistributing the Work and assume any
+>       risks associated with Your exercise of permissions under this License.
+>
+>    8. Limitation of Liability. In no event and under no legal theory,
+>       whether in tort (including negligence), contract, or otherwise,
+>       unless required by applicable law (such as deliberate and grossly
+>       negligent acts) or agreed to in writing, shall any Contributor be
+>       liable to You for damages, including any direct, indirect, special,
+>       incidental, or consequential damages of any character arising as a
+>       result of this License or out of the use or inability to use the
+>       Work (including but not limited to damages for loss of goodwill,
+>       work stoppage, computer failure or malfunction, or any and all
+>       other commercial damages or losses), even if such Contributor
+>       has been advised of the possibility of such damages.
+>
+>    9. Accepting Warranty or Additional Liability. While redistributing
+>       the Work or Derivative Works thereof, You may choose to offer,
+>       and charge a fee for, acceptance of support, warranty, indemnity,
+>       or other liability obligations and/or rights consistent with this
+>       License. However, in accepting such obligations, You may act only
+>       on Your own behalf and on Your sole responsibility, not on behalf
+>       of any other Contributor, and only if You agree to indemnify,
+>       defend, and hold each Contributor harmless for any liability
+>       incurred by, or claims asserted against, such Contributor by reason
+>       of your accepting any such warranty or additional liability.
+>
+>    END OF TERMS AND CONDITIONS
+>
+>    APPENDIX: How to apply the Apache License to your work.
+>
+>       To apply the Apache License to your work, attach the following
+>       boilerplate notice, with the fields enclosed by brackets "{}"
+>       replaced with your own identifying information. (Don't include
+>       the brackets!)  The text should be enclosed in the appropriate
+>       comment syntax for the file format. We also recommend that a
+>       file or class name and description of purpose be included on the
+>       same "printed page" as the copyright notice for easier
+>       identification within third-party archives.
+>
+>    Copyright [year] [fullname]
+>
+>    Licensed under the Apache License, Version 2.0 (the "License");
+>    you may not use this file except in compliance with the License.
+>    You may obtain a copy of the License at
+>
+>        http://www.apache.org/licenses/LICENSE-2.0
+>
+>    Unless required by applicable law or agreed to in writing, software
+>    distributed under the License is distributed on an "AS IS" BASIS,
+>    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+>    See the License for the specific language governing permissions and
+>    limitations under the License.
+>
diff --git a/ext/CuYao/Project.toml b/ext/CuYao/Project.toml
new file mode 100644
index 00000000..88d577a0
--- /dev/null
+++ b/ext/CuYao/Project.toml
@@ -0,0 +1,27 @@
+name = "CuYao"
+uuid = "b48ca7a8-dd42-11e8-2b8e-1b7706800275"
+version = "0.3.9"
+
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
+Yao = "5872b779-8223-5990-8dd0-5abbb0748c8c"
+
+[compat]
+CUDA = "4, 5"
+Reexport = "0.2, 1"
+TupleTools = "1"
+Yao = "0.8"
+YaoBlocks = "0.13.10"
+julia = "1"
+
+[extras]
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+YaoBlocks = "418bc28f-b43b-5e0b-a6e7-61bbc1a2c1df"
+
+[targets]
+test = ["Test", "Statistics", "YaoBlocks"]
diff --git a/ext/CuYao/README.md b/ext/CuYao/README.md
new file mode 100644
index 00000000..4184db5c
--- /dev/null
+++ b/ext/CuYao/README.md
@@ -0,0 +1,86 @@
+**Build status**: [![][gitlab-img]][gitlab-url]
+
+[gitlab-img]: https://gitlab.com/JuliaGPU/CuYao.jl/badges/master/pipeline.svg
+[gitlab-url]: https://gitlab.com/JuliaGPU/CuYao.jl/pipelines
+
+CUDA support for [Yao.jl](https://github.com/QuantumBFS/Yao.jl).
+
+**Only tested locally, expect some adventures and rough edges.**
+
+## Installation
+
+<p>
+CuYao is a &nbsp;
+    <a href="https://julialang.org">
+        <img src="https://julialang.org/favicon.ico" width="16em">
+        Julia Language
+    </a>
+    &nbsp; package. It provides CUDA support for <a href="https://github.com/QuantumBFS/Yao.jl">Yao.jl</a>. To install CuYao,
+    please <a href="https://docs.julialang.org/en/v1/manual/getting-started/">open
+    Julia's interactive session (known as REPL)</a> and press <kbd>]</kbd> key in the REPL to use the package mode, then type the following command
+</p>
+
+For stable release
+
+```julia
+pkg> add CuYao
+```
+
+For current master
+
+```julia
+pkg> add CuYao#master
+```
+
+You don't need to install Yao if you have `CuYao` installed. They share the same API except CUDA backend.
+
+## Documentation
+
+For documentation of [Yao](https://github.com/QuantumBFS/Yao.jl), please refer to [documentation (stable)](https://quantumbfs.github.io/Yao.jl/stable).
+
+`CuYao.jl` provides only two extra APIs, `reg |> cu` to upload a register to GPU, and `cureg |> cpu` to download a register to CPU.
+
+To start, see the following example
+```julia
+using CuYao
+
+cureg = rand_state(9; nbatch=1000) |> cu   # or `curand_state(9; nbatch=1000)`.
+cureg |> put(9, 2=>Z)
+measure!(cureg |> append_qubits!(1) |> focus!(4,1,3))
+cureg |> relax!(4,1,3) |> cpu
+```
+
+Constructors `curand_state`, `cuzero_state`, `cuproduct_state`, `cuuniform_state` and `cughz_state` are tailored for GPU,
+they are faster than uploading a CPU register to CPU.
+
+## Features
+### Supported Gates
+
+- general U(N) gate
+- general U(1) gate
+- better X, Y, Z gate
+- better T, S gate
+- SWAP gate
+- better control gates
+- BP diff blocks
+
+### Supported Register Operations
+- measure!, measure_reset!, measure_remove!, select
+- append_qudits!, append_qubits!
+- insert_qudit!, insert_qubits!
+- focus!, relax!
+- join
+- density_matrix
+- fidelity (not including density matrix)
+- expect
+
+### Other Operations
+- autodiff is supported when the only parameterized gates are rotation gates in a circuit.
+
+## The Team
+
+This project is an effort of QuantumBFS, an open source organization for quantum science. Yao is currently maintained by [Xiu-Zhe (Roger) Luo](https://github.com/Roger-luo) and [Jin-Guo Liu](https://github.com/GiggleLiu) with contributions from open source community. All the contributors are listed in the [contributors](https://github.com/QuantumBFS/Yao.jl/graphs/contributors).
+
+## License
+
+**CuYao** is released under the Apache 2 license.
diff --git a/ext/CuYao/benchmarks/TestCuda.jl b/ext/CuYao/benchmarks/TestCuda.jl
new file mode 100644
index 00000000..8c6a6cd5
--- /dev/null
+++ b/ext/CuYao/benchmarks/TestCuda.jl
@@ -0,0 +1,54 @@
+using CUDA
+using LinearAlgebra
+using BenchmarkTools
+
+function ms!(X::CuArray, s::Number)
+    function kernel(X, s)
+        i = (blockIdx().x-1) * blockDim().x + threadIdx().x
+        @inbounds X[i] *= s
+        return
+    end
+    @cuda blocks=length(X) kernel(X, s)
+    X
+end
+
+function ms1!(X::CuArray, a, b, c)
+    k = f(a)
+    function kernel(X, a, b, c)
+        i = (blockIdx().x-1) * blockDim().x + threadIdx().x
+        k(X, i)
+        k(X, i)
+        k(X, i)
+        #@inbounds X[i] *= a
+        #@inbounds X[i] *= b
+        #@inbounds X[i] *= c
+        return
+    end
+    @cuda blocks=length(X)÷256 threads=256 kernel(X, a, b, c)
+    X
+end
+
+@inline function f(s)
+    @inline function kernel(X, i)
+        X[i]*=s
+    end
+end
+function ms2!(X::CuArray, s::Number)
+    function kernel(X, s)
+        i = (blockIdx().x-1) * blockDim().x + threadIdx().x
+        @inbounds X[i] *= s
+        return
+    end
+    @cuda blocks=length(X)÷256 threads=256 kernel(X, s)
+    X
+end
+
+a = randn(1<<10)
+cua = cu(a)
+@benchmark ms1!($cua, 1.001, 1.001, 1.001)
+@benchmark ms2!(ms2!(ms2!($cua, 1.001), 1.001), 1.001)
+bss = 1:length(cua)
+@benchmark f(1.001).(cua, bss)
+
+@benchmark rmul!($a, 1.01)
+@benchmark ms!($cua, 1.0)
diff --git a/ext/CuYao/benchmarks/gates.jl b/ext/CuYao/benchmarks/gates.jl
new file mode 100644
index 00000000..b63dbd49
--- /dev/null
+++ b/ext/CuYao/benchmarks/gates.jl
@@ -0,0 +1,19 @@
+using Yao, CuYao, CUDA
+using BenchmarkTools
+
+reg = rand_state(12; nbatch=1000)
+creg = reg |> cu
+@benchmark CUDA.@sync creg |> put(12, 3=>Z)
+@benchmark CUDA.@sync creg |> put(12, 3=>X)
+@benchmark reg |> put(12, 3=>Z)
+@benchmark CUDA.@sync creg |> control(12, 6, 3=>X)
+@benchmark reg |> control(12, 6, 3=>X)
+@benchmark CUDA.@sync creg |> put(12, 3=>rot(X, 0.3))
+@benchmark reg |> put(12, 3=>rot(X, 0.3))
+
+reg = rand_state(20)
+creg = reg |> cu
+g = swap(20, 7, 2)
+g = put(20, (7, 2)=>matblock(rand_unitary(4)))
+@benchmark reg |> g
+@benchmark CUDA.@sync creg |> g
diff --git a/ext/CuYao/benchmarks/gcompile.jl b/ext/CuYao/benchmarks/gcompile.jl
new file mode 100644
index 00000000..62a90ae9
--- /dev/null
+++ b/ext/CuYao/benchmarks/gcompile.jl
@@ -0,0 +1,92 @@
+using Yao, Yao.Boost, Yao.Intrinsics, StaticArrays, Yao.Blocks
+using CuYao, CUDA
+using BenchmarkTools, Profile
+
+nbit = 12
+c = chain(put(nbit, 2=>X), put(nbit, 2=>rot(X, 0.2)), control(nbit, 3, 2=>rot(X,0.3)))
+#c = chain(c..., c...,c...)
+cc = c |> KernelCompiled
+reg = rand_state(nbit) |> cu
+
+@benchmark $reg |> copy |> $c[1] seconds = 2
+@benchmark $reg |> copy |> $cc seconds = 2
+
+
+reg = rand_state(9, 1000)
+creg = reg |> cu
+@benchmark focus!(reg |> copy, 1) do r
+    measure!(r)
+    r
+end
+
+@benchmark focus!(creg|>copy, 1) do r
+    measure!(r)
+    r
+end
+
+@benchmark focus!(creg, 1) do r
+    measure_reset!(r)
+    r
+end
+
+@profile for i=1:10 focus!(creg |> copy, 1) do r
+    measure!(r)
+    r
+end
+end
+
+a = randn(1<<10)
+ca = a |> cu
+
+gpu_call(ca, (x->x^2, ca)) do state, f, ca
+    ilin = linear_index(state)
+    ca[ilin] = f(ca[ilin])
+    return
+end
+
+gpu_call(1:1<<10, (x->x^2, ca)) do state, f, ca
+    ilin = linear_index(state)
+    ca[ilin] = f(ca[ilin])
+    return
+end
+
+
+function xx_kernel(bits::Ints)
+    ctrl = controller(bits[1], 0)
+    mask = bmask(bits...)
+    function kernel(state, inds)
+        i = inds[1]
+        b = i-1
+        ctrl(b) && swaprows!(piecewise(state, inds), i, flip(b, mask) + 1)
+    end
+end
+
+using CuYao: x_kernel, y_kernel, z_kernel
+fx = x_kernel((3,4))
+fy = y_kernel(3)
+fz = z_kernel((3,4))
+
+a = randn(1<<10)
+ca = a |> cu
+fs = (fx, fz)
+
+@benchmark gpu_call(ca, (fs, ca)) do state, f, ca
+    #ilin = linear_index(state)
+    ilin = @cartesianidx ca
+    for fi in fs
+        fi(ca, ilin)
+    end
+    return
+end
+
+@device_code_warntype gpu_call(ca, ((fx, fz), ca)) do state, fs, ca
+    ilin = @cartesianidx ca
+    #for fi in fs fi(ca, ilin) end
+    @Base.Cartesian.@nexprs $(length(fs)) i->fs[i](ca, ilin)
+    #fs(ca, ilin)
+    return
+end
+
+a ≈ Vector(ca)
+
+@edit controller(3,0)
diff --git a/ext/CuYao/benchmarks/paralleldot.jl b/ext/CuYao/benchmarks/paralleldot.jl
new file mode 100644
index 00000000..38e5dd6e
--- /dev/null
+++ b/ext/CuYao/benchmarks/paralleldot.jl
@@ -0,0 +1,11 @@
+function paralleldot(matrices::CuVector, ptrA, ptrB)
+    @inline function kernel(ctx, matrices)
+        inds = @cartesianidx state
+        i = inds[1]
+        piecewise(state, inds)[i] *= anyone(i-1, mask) ? d : a
+        return
+    end
+    gpu_call(kernel, state, a, d, mask; elements=length(state))
+    return state
+end
+
diff --git a/ext/CuYao/benchmarks/statexpect.jl b/ext/CuYao/benchmarks/statexpect.jl
new file mode 100644
index 00000000..9e3ca5dd
--- /dev/null
+++ b/ext/CuYao/benchmarks/statexpect.jl
@@ -0,0 +1,13 @@
+using CuYao
+using Yao
+using BenchmarkTools
+
+sf(x, y) = abs(x-y)
+a = randn(1024)
+ca = a |> cu
+b = randn(1024)
+cb = b |> cu
+@benchmark expect(StatFunctional{2}(sf), a, b) seconds=2
+@benchmark expect(StatFunctional{2}(sf), a) seconds=2
+@benchmark expect(StatFunctional{2}(sf), ca, cb) seconds=2
+@benchmark expect(StatFunctional{2}(sf), ca) seconds=2
diff --git a/ext/CuYao/src/CUDApatch.jl b/ext/CuYao/src/CUDApatch.jl
new file mode 100644
index 00000000..aeaf7d36
--- /dev/null
+++ b/ext/CuYao/src/CUDApatch.jl
@@ -0,0 +1,64 @@
+# TODO
+# support norm(view(reshape(A, m, n), :, 1))
+norm2(A::DenseCuArray; dims=1) = mapreduce(abs2, +, A, dims=dims) .|> CUDA.sqrt
+
+piecewise(state::AbstractVector, inds) = state
+piecewise(state::AbstractMatrix, inds) = @inbounds view(state,:,inds[2])
+
+function kron(A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T1, T2}
+    res = CUDA.zeros(promote_type(T1,T2), (size(A).*size(B))...)
+    @inline function kernel(ctx, res, A, B)
+        state = @linearidx res
+        @inbounds inds = CartesianIndices(res)[state].I
+        inds_A = (inds.-1) .÷ size(B) .+ 1
+        inds_B = (inds.-1) .% size(B) .+ 1
+        @inbounds res[state] = A[inds_A...]*B[inds_B...]
+        return
+    end
+
+    gpu_call(kernel, res, A, B)
+    res
+end
+
+"""
+    kron!(C::CuArray, A, B)
+
+Computes Kronecker products in-place on the GPU.
+The results are stored in 'C', overwriting the existing values of 'C'.
+"""
+function Yao.YaoArrayRegister.kron!(C::CuArray{T3}, A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T1, T2, T3}
+    @boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2))) || throw(DimensionMismatch())
+    CI = Base.CartesianIndices(C)
+    @inline function kernel(ctx, C, A, B)
+        state = @linearidx C
+        @inbounds inds = CI[state].I
+        inds_A = (inds.-1) .÷ size(B) .+ 1
+        inds_B = (inds.-1) .% size(B) .+ 1
+        @inbounds C[state] = A[inds_A...]*B[inds_B...]
+        return
+    end
+
+    gpu_call(kernel, C, A, B)
+    C
+end
+
+function getindex(A::DenseCuVector{T}, B::DenseCuArray{<:Integer}) where T
+    res = CUDA.zeros(T, size(B)...)
+    @inline function kernel(ctx, res, A, B)
+        state = @linearidx res
+        @inbounds res[state] = A[B[state]]
+        return
+    end
+    gpu_call(kernel, res, A, B)
+    res
+end
+
+function getindex(A::AbstractVector, B::DenseCuArray{<:Integer})
+    A[Array(B)]
+end
+
+YaoBlocks.AD.as_scalar(x::DenseCuArray) = Array(x)[]
+
+# patch for ExponentialUtilities
+YaoBlocks.compatible_multiplicative_operand(::CuArray, source::AbstractArray) = CuArray(source)
+YaoBlocks.compatible_multiplicative_operand(::CuArray, source::CuArray) = source
diff --git a/ext/CuYao/src/CuYao.jl b/ext/CuYao/src/CuYao.jl
new file mode 100644
index 00000000..cc64a45e
--- /dev/null
+++ b/ext/CuYao/src/CuYao.jl
@@ -0,0 +1,39 @@
+# module CuYao
+using Yao.YaoArrayRegister.LuxurySparse, Yao.YaoArrayRegister.StaticArrays, LinearAlgebra, Base.Cartesian
+using Yao.YaoArrayRegister.StatsBase, Yao.YaoArrayRegister.BitBasis
+using Yao.Reexport
+import Yao.YaoArrayRegister.TupleTools
+using Yao.YaoArrayRegister.Random
+
+using Yao.YaoArrayRegister
+using Yao
+using CUDA
+using CUDA.GPUArrays: gpu_call, @linearidx, @cartesianidx, linear_index
+using Yao.YaoArrayRegister
+using Yao.YaoBlocks
+using Yao.ConstGate: SWAPGate
+using Yao.ConstGate: S, T, Sdag, Tdag
+
+import Yao.YaoArrayRegister: insert_qudits!, join
+import CUDA: cu
+import Yao.YaoArrayRegister: _measure, measure, measure!
+import Yao.YaoArrayRegister: batch_normalize!
+import Yao.YaoBlocks: BlockedBasis, nblocks, subblock
+import Yao: expect
+import Yao.YaoArrayRegister: u1rows!, unrows!, autostatic, instruct!, swaprows!, mulrow!
+import Yao.YaoArrayRegister.LinearAlgebra: norm
+import Base: kron, getindex
+
+export cpu, cu, AbstractCuArrayReg, CuArrayReg, CuBatchedArrayReg, CuDensityMatrix,
+    cuzero_state, cuuniform_state, curand_state, cuproduct_state, cughz_state
+
+const Ints = NTuple{<:Any, Int}
+
+include("CUDApatch.jl")
+include("register.jl")
+include("instructs.jl")
+
+function __init__()
+    CUDA.allowscalar(false)
+end
+# end
diff --git a/ext/CuYao/src/instructs.jl b/ext/CuYao/src/instructs.jl
new file mode 100644
index 00000000..6ed8e35d
--- /dev/null
+++ b/ext/CuYao/src/instructs.jl
@@ -0,0 +1,312 @@
+# get index
+macro idx(shape, grididx=1, ctxsym=:ctx)
+    quote
+        x = $(esc(shape))
+        i = $linear_index($(esc(ctxsym)), $(esc(grididx)))
+        i > $prod2(x) && return
+        @inbounds Base.CartesianIndices(x)[i].I
+    end
+end
+replace_first(x::NTuple{2}, v) = (v, x[2])
+replace_first(x::NTuple{1}, v) = (v,)
+prod2(x::NTuple{2}) = x[1] * x[2]
+prod2(x::NTuple{1}) = x[1]
+
+gpu_compatible(A::AbstractVecOrMat) = A |> staticize
+gpu_compatible(A::StaticArray) = A
+
+###################### unapply! ############################
+function instruct!(::Val{2}, state::DenseCuVecOrMat, U0::AbstractMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M}
+    @debug "The generic U(N) matrix of size ($(size(U0))), on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
+    nbit = log2dim1(state)
+    # reorder a unirary matrix.
+    U = gpu_compatible(all(TupleTools.diff(locs).>0) ? U0 : reorder(U0, collect(locs)|>sortperm))
+    locked_bits = [clocs..., locs...]
+    locked_vals = [cvals..., zeros(Int, M)...]
+    locs_raw = gpu_compatible([i+1 for i in itercontrol(nbit, setdiff(1:nbit, locs), zeros(Int, nbit-M))])
+    configs = itercontrol(nbit, locked_bits, locked_vals)
+
+    len = length(configs)
+    @inline function kernel(ctx, state, locs_raw, U, configs, len)
+        CUDA.assume(len > 0)
+        sz = size(state)
+        CUDA.assume(length(sz) == 1 || length(sz) == 2)
+        inds = @idx replace_first(sz, len)
+        x = @inbounds configs[inds[1]]
+        @inbounds unrows!(piecewise(state, inds), x .+ locs_raw, U)
+        return
+    end
+
+    @inline function kernel_single_entry_diag(ctx, state, loc, val, configs, len)
+        CUDA.assume(len > 0)
+        sz = size(state)
+        CUDA.assume(length(sz) == 1 || length(sz) == 2)
+        inds = @idx replace_first(sz, len)
+        x = @inbounds configs[inds[1]]
+        @inbounds piecewise(state, inds)[x + loc] *= val
+        return
+    end
+
+    elements = len*size(state,2)
+    if U isa Diagonal && count(!isone, U.diag) == 1
+        @debug "The single entry diagonal matrix, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
+        k = findfirst(!isone, U.diag)
+        loc = locs_raw[k]
+        val = U.diag[k]
+        gpu_call(kernel_single_entry_diag, state, loc, val, configs, len; elements)
+    else
+        gpu_call(kernel, state, locs_raw, U, configs, len; elements)
+    end
+    state
+end
+instruct!(::Val{2}, state::DenseCuVecOrMat, U0::IMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = state
+instruct!(::Val{2}, state::DenseCuVecOrMat, U0::SDSparseMatrixCSC, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = instruct!(Val(2), state, U0 |> Matrix, locs, clocs, cvals)
+
+################## General U1 apply! ###################
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::SDSparseMatrixCSC, loc::Int)
+    instruct!(Val(2), state, Matrix(U1), loc, clocs, cval)
+end
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::AbstractMatrix, loc::Int)
+    @debug "The generic U(2) matrix of size ($(size(U1))), on: GPU, locations: $(loc)."
+    a, c, b, d = U1
+    nbit = log2dim1(state)
+    step = 1<<(loc-1)
+    configs = itercontrol(nbit, [loc], [0])
+
+    len = length(configs)
+    @inline function kernel(ctx, state, a, b, c, d, len)
+        inds = @idx replace_first(size(state), len)
+        i = @inbounds configs[inds[1]]+1
+        @inbounds u1rows!(piecewise(state, inds), i, i+step, a, b, c, d)
+        return
+    end
+    gpu_call(kernel, state, a, b, c, d, len; elements=len*size(state,2))
+    return state
+end
+
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::SDPermMatrix, loc::Int)
+    @debug "The single qubit permutation matrix of size ($(size(U1))), on: GPU, locations: $(loc)."
+    nbit = log2dim1(state)
+    b, c = U1.vals
+    step = 1<<(loc-1)
+    configs = itercontrol(nbit, [loc], [0])
+
+    len = length(configs)
+    function kernel(ctx, state, b, c, step, len, configs)
+        inds = @idx replace_first(size(state), len)
+        x = @inbounds configs[inds[1]] + 1
+        @inbounds swaprows!(piecewise(state, inds), x, x+step, c, b)
+        return
+    end
+    gpu_call(kernel, state, b, c, step, len, configs; elements=len*size(state,2))
+    return state
+end
+
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::SDDiagonal, loc::Int)
+    @debug "The single qubit diagonal matrix of size ($(size(U1))), on: GPU, locations: $(loc)."
+    a, d = U1.diag
+    nbit = log2dim1(state)
+    mask = bmask(loc)
+    @inline function kernel(ctx, state, a, d, mask)
+        inds = @cartesianidx state
+        i = inds[1]
+        piecewise(state, inds)[i] *= anyone(i-1, mask) ? d : a
+        return
+    end
+    gpu_call(kernel, state, a, d, mask; elements=length(state))
+    return state
+end
+
+YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U::IMatrix, loc::Int) = state
+
+################## XYZ #############
+
+_instruct!(state::DenseCuVecOrMat, ::Val{:X}, locs::NTuple{L,Int}) where {L} = _instruct!(state, Val(:X), locs, (), ())
+function _instruct!(state::DenseCuVecOrMat, ::Val{:X}, locs::NTuple{L,Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {L,C}
+    @debug "The X gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
+    length(locs) == 0 && return state
+    nbit = log2dim1(state)
+    configs = itercontrol(nbit, [clocs..., locs[1]], [cvals..., 0])
+    mask = bmask(locs...)
+    len = length(configs)
+    @inline function kernel(ctx, state, mask, len, configs)
+        inds = @idx replace_first(size(state), len)
+        b = @inbounds configs[inds[1]]
+        @inbounds swaprows!(piecewise(state, inds), b+1, flip(b, mask) + 1)
+        return
+    end
+    gpu_call(kernel, state, mask, len, configs; elements=len*size(state,2))
+    return state
+end
+
+function _instruct!(state::DenseCuVecOrMat, ::Val{:Y}, locs::NTuple{C,Int}) where C
+    @debug "The Y gate, on: GPU, locations: $(locs)."
+    length(locs) == 0 && return state
+    nbit = log2dim1(state)
+    mask = bmask(Int, locs...)
+    configs = itercontrol(nbit, [locs[1]], [0])
+    bit_parity = length(locs)%2 == 0 ? 1 : -1
+    factor = (-im)^length(locs)
+    len = length(configs)
+    @inline function kernel(ctx, state, mask, bit_parity, configs, len)
+        inds = @idx replace_first(size(state), len)
+        b = @inbounds configs[inds[1]]
+        i_ = flip(b, mask) + 1
+        factor1 = count_ones(b&mask)%2 == 1 ? -factor : factor
+        factor2 = factor1*bit_parity
+        @inbounds swaprows!(piecewise(state, inds), b+1, i_, factor2, factor1)
+        return
+    end
+    gpu_call(kernel, state, mask, bit_parity, configs, len; elements=len*size(state,2))
+    return state
+end
+
+function _instruct!(state::DenseCuVecOrMat, ::Val{:Y}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
+    @debug "The Y gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
+    length(locs) == 0 && return state
+    nbit = log2dim1(state)
+    configs = itercontrol(nbit, [clocs..., locs...], [cvals..., 0])
+    mask = bmask(locs...)
+    len = length(configs)
+    @inline function kernel(ctx, state, configs, mask, len)
+        inds = @idx replace_first(size(state), len)
+        b = @inbounds configs[inds[1]]
+        @inbounds swaprows!(piecewise(state, inds), b+1, flip(b, mask) + 1, im, -im)
+        return
+    end
+    gpu_call(kernel, state, configs, mask, len; elements=len*size(state,2))
+    return state
+end
+
+function _instruct!(state::DenseCuVecOrMat, ::Val{:Z}, locs::NTuple{C,Int}) where C
+    @debug "The Z gate, on: GPU, locations: $(locs)."
+    length(locs) == 0 && return state
+    nbit = log2dim1(state)
+    mask = bmask(locs...)
+    @inline function kernel(ctx, state, mask)
+        inds = @cartesianidx state
+        i = inds[1]
+        piecewise(state, inds)[i] *= count_ones((i-1)&mask)%2==1 ? -1 : 1
+        return
+    end
+    gpu_call(kernel, state, mask; elements=length(state))
+    return state
+end
+
+
+for (G, FACTOR) in zip([:Z, :S, :T, :Sdag, :Tdag], [:(-one(Int32)), :(1f0im), :($(exp(im*π/4))), :(-1f0im), :($(exp(-im*π/4)))])
+    if G !== :Z
+        @eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C
+            @debug "The $($(QuoteNode(G))) gate, on: GPU, locations: $(locs)."
+            length(locs) == 0 && return state
+            nbit = log2dim1(state)
+            mask = bmask(Int32, locs...)
+            @inline function kernel(ctx, state, mask)
+                inds = @cartesianidx state
+                i = inds[1]
+                piecewise(state, inds)[i] *= $FACTOR ^ count_ones(Int32(i-1)&mask)
+                return
+            end
+            gpu_call(kernel, state, mask; elements=length(state))
+            return state
+        end
+    end
+    @eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
+        @debug "The $($(QuoteNode(G))) gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
+        ctrl = controller((clocs..., locs...), (cvals..., 1))
+        @inline function kernel(ctx, state, ctrl)
+            inds = @cartesianidx state
+            i = inds[1]
+            piecewise(state, inds)[i] *= ctrl(i-1) ? $FACTOR : one($FACTOR)
+            return
+        end
+        gpu_call(kernel, state, ctrl; elements=length(state))
+        return state
+    end
+end
+
+for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag]
+    @eval begin
+        function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C
+            _instruct!(state, g, locs)
+        end
+
+        function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::Tuple{Int})
+            _instruct!(state, g, locs)
+        end
+
+        function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
+            _instruct!(state, g, locs, clocs, cvals)
+        end
+
+        function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, vg::Val{$(QuoteNode(G))}, locs::Tuple{Int}, cloc::Tuple{Int}, cval::Tuple{Int})
+            _instruct!(state, vg, locs, cloc, cval)
+        end
+    end
+
+end
+
+function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:SWAP}, locs::Tuple{Int,Int})
+    @debug "The SWAP gate, on: GPU, locations: $(locs)."
+    b1, b2 = locs
+    mask1 = bmask(b1)
+    mask2 = bmask(b2)
+
+    configs = itercontrol(log2dim1(state), [locs...], [1,0])
+    function kf(ctx, state, mask1, mask2)
+        inds = @idx replace_first(size(state), length(configs))
+
+        b = configs[inds[1]]
+        i = b+1
+        i_ = b ⊻ (mask1|mask2) + 1
+        swaprows!(piecewise(state, inds), i, i_)
+        nothing
+    end
+    gpu_call(kf, state, mask1, mask2; elements=length(configs)*size(state,2))
+    state
+end
+
+############## other gates ################
+# parametrized swap gate
+
+function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:PSWAP}, locs::Tuple{Int, Int}, θ::Real)
+    @debug "The PSWAP gate, on: GPU, locations: $(locs)."
+    nbit = log2dim1(state)
+    mask1 = bmask(locs[1])
+    mask2 = bmask(locs[2])
+    mask12 = mask1|mask2
+    a, c, b_, d = mat(Rx(θ))
+    e = exp(-im/2*θ)
+    configs = itercontrol(nbit, [locs...], [0,0])
+    len = length(configs)
+    @inline function kernel(ctx, state, mask2, mask12, configs, a, b_, c, d)
+        inds = @idx replace_first(size(state), len)
+        @inbounds x = configs[inds[1]]
+        piecewise(state, inds)[x+1] *= e
+        piecewise(state, inds)[x⊻mask12+1] *= e
+        y = x ⊻ mask2
+        @inbounds u1rows!(piecewise(state, inds), y+1, y⊻mask12+1, a, b_, c, d)
+        return
+    end
+    gpu_call(kernel, state, mask2, mask12, configs, a, b_, c, d; elements=len*size(state,2))
+    state
+end
+
+function YaoBlocks._apply_fallback!(r::AbstractCuArrayReg{B,T}, b::AbstractBlock) where {B,T}
+    YaoBlocks._check_size(r, b)
+    r.state .= CUDA.adapt(CuArray{T}, mat(T, b)) * r.state
+    return r
+end
+
+for RG in [:Rx, :Ry, :Rz]
+    @eval function instruct!(
+            ::Val{2}, 
+            state::DenseCuVecOrMat{T},
+            ::Val{$(QuoteNode(RG))},
+            locs::Tuple{Int},
+            theta::Number
+        ) where {T}
+        YaoArrayRegister.instruct!(Val(2), state, Val($(QuoteNode(RG))), locs, (), (), theta)
+        return state
+    end
+end
diff --git a/ext/CuYao/src/register.jl b/ext/CuYao/src/register.jl
new file mode 100644
index 00000000..cc493e94
--- /dev/null
+++ b/ext/CuYao/src/register.jl
@@ -0,0 +1,290 @@
+cu(reg::ArrayReg{D}) where D = ArrayReg{D}(CuArray(reg.state))
+cpu(reg::ArrayReg{D}) where D = ArrayReg{D}(Array(reg.state))
+cu(reg::BatchedArrayReg{D}) where D = BatchedArrayReg{D}(CuArray(reg.state), reg.nbatch)
+cpu(reg::BatchedArrayReg{D}) where D = BatchedArrayReg{D}(Array(reg.state), reg.nbatch)
+cu(reg::DensityMatrix{D}) where D = DensityMatrix{D}(CuArray(reg.state))
+cpu(reg::DensityMatrix{D}) where D = DensityMatrix{D}(Array(reg.state))
+const AbstractCuArrayReg{D, T, MT} = AbstractArrayReg{D, T, MT} where MT<:DenseCuArray
+const CuArrayReg{D, T, MT} = ArrayReg{D, T, MT} where MT<:DenseCuArray
+const CuBatchedArrayReg{D, T, MT} = BatchedArrayReg{D, T, MT} where MT<:DenseCuArray
+const CuDensityMatrix{D, T, MT} = DensityMatrix{D, T, MT} where MT<:DenseCuMatrix
+
+function batch_normalize!(s::DenseCuArray, p::Real=2)
+    p!=2 && throw(ArgumentError("p must be 2!"))
+    s./=norm2(s, dims=1)
+    return s
+end
+
+@inline function tri2ij(l::Int)
+    i = ceil(Int, sqrt(2*l+0.25)-0.5)
+    j = l-i*(i-1)÷2
+    return i+1,j
+end
+
+############### MEASURE ##################
+function measure(::ComputationalBasis, reg::ArrayReg{D, T, MT} where MT<:DenseCuArray, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG, nshots::Int=1) where {D,T}
+    _measure(rng, basis(reg), reg |> probs |> Vector, nshots)
+end
+
+# TODO: optimize the batch dimension using parallel sampling
+function measure(::ComputationalBasis, reg::BatchedArrayReg{D, T, MT} where MT<:DenseCuArray, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG, nshots::Int=1) where {D,T}
+    regm = reg |> rank3
+    pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2)
+    return _measure(rng, basis(reg), pl |> Matrix, nshots)
+end
+
+function measure!(::RemoveMeasured, ::ComputationalBasis, reg::AbstractCuArrayReg{D}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where D
+    regm = reg |> rank3
+    B = size(regm, 3)
+    nregm = similar(regm, D ^ nremain(reg), B)
+    pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2)
+    pl_cpu = pl |> Matrix
+    res_cpu = map(ib->_measure(rng, basis(reg), view(pl_cpu, :, ib), 1)[], 1:B)
+    res = CuArray(res_cpu)
+    CI = Base.CartesianIndices(nregm)
+    @inline function kernel(ctx, nregm, regm, res, pl)
+        state = @linearidx nregm
+        @inbounds i,j = CI[state].I
+        @inbounds r = Int(res[j])+1
+        @inbounds nregm[i,j] = regm[r,i,j]/CUDA.sqrt(pl[r, j])
+        return
+    end
+    gpu_call(kernel, nregm, regm, res, pl)
+    reg.state = reshape(nregm,1,:)
+    return reg isa ArrayReg ? Array(res)[] : res
+end
+
+function measure!(::NoPostProcess, ::ComputationalBasis, reg::AbstractCuArrayReg{D, T}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where {D, T}
+    regm = reg |> rank3
+    B = size(regm, 3)
+    pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2)
+    pl_cpu = pl |> Matrix
+    res_cpu = map(ib->_measure(rng, basis(reg), view(pl_cpu, :, ib), 1)[], 1:B)
+    res = CuArray(res_cpu)
+    CI = Base.CartesianIndices(regm)
+
+    @inline function kernel(ctx, regm, res, pl)
+        state = @linearidx regm
+        @inbounds k,i,j = CI[state].I
+        @inbounds rind = Int(res[j]) + 1
+        @inbounds regm[k,i,j] = k==rind ? regm[k,i,j]/CUDA.sqrt(pl[k, j]) : T(0)
+        return
+    end
+    gpu_call(kernel, regm, res, pl)
+    return reg isa ArrayReg ? Array(res)[] : res
+end
+
+function YaoArrayRegister.measure!(
+    ::NoPostProcess,
+    bb::BlockedBasis,
+    reg::AbstractCuArrayReg{D,T},
+    ::AllLocs;
+    rng::AbstractRNG = Random.GLOBAL_RNG,
+) where {D,T}
+    state = @inbounds (reg|>rank3)[bb.perm, :, :]  # permute to make eigen values sorted
+    B = size(state, 3)
+    pl = dropdims(mapreduce(abs2, +, state, dims=2), dims=2)
+    pl_cpu = pl |> Matrix
+    pl_block = zeros(eltype(pl), nblocks(bb), B)
+    @inbounds for ib = 1:B
+        for i = 1:nblocks(bb)
+            for k in subblock(bb, i)
+                pl_block[i, ib] += pl_cpu[k, ib]
+            end
+        end
+    end
+    # perform measurements on CPU
+    res_cpu = Vector{Int}(undef, B)
+    @inbounds @views for ib = 1:B
+        ires = sample(rng, 1:nblocks(bb), Weights(pl_block[:, ib]))
+        # notice ires is `BitStr` type, can be use as indices directly.
+        range = subblock(bb, ires)
+        state[range, :, ib] ./= sqrt(pl_block[ires, ib])
+        state[1:range.start-1, :, ib] .= zero(T)
+        state[range.stop+1:size(state, 1), :, ib] .= zero(T)
+        res_cpu[ib] = ires
+    end
+    # undo permute and assign back
+    _state = reshape(state, 1 << nactive(reg), :)
+    rstate = reshape(reg.state, 1 << nactive(reg), :)
+    @inbounds rstate[bb.perm, :] .= _state
+    return reg isa ArrayReg ? bb.values[res_cpu[]] : CuArray(bb.values[res_cpu])
+end
+
+function measure!(rst::ResetTo, ::ComputationalBasis, reg::AbstractCuArrayReg{D, T}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where {D, T}
+    regm = reg |> rank3
+    B = size(regm, 3)
+    pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2)
+    pl_cpu = pl |> Matrix
+    res_cpu = map(ib->_measure(rng, basis(reg), view(pl_cpu, :, ib), 1)[], 1:B)
+    res = CuArray(res_cpu)
+    CI = Base.CartesianIndices(regm)
+
+    @inline function kernel(ctx, regm, res, pl, val)
+        state = @linearidx regm
+        @inbounds k,i,j = CI[state].I
+        @inbounds rind = Int(res[j]) + 1
+        @inbounds k==val+1 && (regm[k,i,j] = regm[rind,i,j]/CUDA.sqrt(pl[rind, j]))
+        CUDA.sync_threads()
+        @inbounds k!=val+1 && (regm[k,i,j] = 0)
+        return
+    end
+
+    gpu_call(kernel, regm, res, pl, rst.x)
+    return reg isa ArrayReg ? Array(res)[] : res
+end
+
+function YaoArrayRegister.batched_kron(A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T1 ,T2}
+    res = CUDA.zeros(promote_type(T1,T2), size(A,1)*size(B, 1), size(A,2)*size(B,2), size(A, 3))
+    CI = Base.CartesianIndices(res)
+    @inline function kernel(ctx, res, A, B)
+        state = @linearidx res
+        @inbounds i,j,b = CI[state].I
+        i_A, i_B = divrem((i-1), size(B,1))
+        j_A, j_B = divrem((j-1), size(B,2))
+        @inbounds res[state] = A[i_A+1, j_A+1, b]*B[i_B+1, j_B+1, b]
+        return
+    end
+
+    gpu_call(kernel, res, A, B)
+    return res
+end
+
+"""
+    YaoArrayRegister.batched_kron!(C::CuArray, A, B)
+
+Performs batched Kronecker products in-place on the GPU.
+The results are stored in 'C', overwriting the existing values of 'C'.
+"""
+function YaoArrayRegister.batched_kron!(C::CuArray{T3, 3}, A::DenseCuArray, B::DenseCuArray) where {T3}
+    @boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2), size(A,3))) || throw(DimensionMismatch())
+    @boundscheck (size(A,3) == size(B,3) == size(C,3)) || throw(DimensionMismatch())
+    CI = Base.CartesianIndices(C)
+    @inline function kernel(ctx, C, A, B)
+        state = @linearidx C
+        @inbounds i,j,b = CI[state].I
+        i_A, i_B = divrem((i-1), size(B,1))
+        j_A, j_B = divrem((j-1), size(B,2))
+        @inbounds C[state] = A[i_A+1, j_A+1, b]*B[i_B+1, j_B+1, b]
+        return
+    end
+
+    gpu_call(kernel, C, A, B)
+    return C
+end
+
+function join(reg1::AbstractCuArrayReg{D}, reg2::AbstractCuArrayReg{D}) where {D}
+    @assert nbatch(reg1) == nbatch(reg2)
+    s1 = reg1 |> rank3
+    s2 = reg2 |> rank3
+    state = YaoArrayRegister.batched_kron(s1, s2)
+    return arrayreg(copy(reshape(state, size(state, 1), :)); nlevel=D, nbatch=nbatch(reg1))
+end
+
+function Yao.insert_qudits!(reg::AbstractCuArrayReg{D}, loc::Int; nqudits::Int=1) where D
+    na = nactive(reg)
+    focus!(reg, 1:loc-1)
+    reg2 = join(zero_state(nqudits; nbatch=nbatch(reg)) |> cu, reg) |> relax! |> focus!((1:na+nqudits)...)
+    reg.state = reg2.state
+    return reg
+end
+
+"""
+    cuproduct_state([T=ComplexF64], total::Int, bit_config::Integer; nbatch=NoBatch())
+
+The GPU version of [`product_state`](@ref).
+"""
+cuproduct_state(bit_str::BitStr; nbatch::Union{NoBatch,Int} = NoBatch()) =
+    cuproduct_state(ComplexF64, bit_str; nbatch = nbatch)
+cuproduct_state(bit_str::AbstractVector; nbatch::Union{NoBatch,Int} = NoBatch()) =
+    cuproduct_state(ComplexF64, bit_str; nbatch = nbatch)
+cuproduct_state(total::Int, bit_config::Integer; kwargs...) =
+    cuproduct_state(ComplexF64, total, bit_config; kwargs...)
+cuproduct_state(::Type{T}, bit_str::BitStr{N}; kwargs...) where {T,N} =
+    cuproduct_state(T, N, buffer(bit_str); kwargs...)
+cuproduct_state(::Type{T}, bit_configs::AbstractVector; kwargs...) where {T} =
+    cuproduct_state(T, bit_literal(bit_configs...); kwargs...)
+function cuproduct_state(
+    ::Type{T},
+    total::Int,
+    bit_config::Integer;
+    nbatch::Union{Int,NoBatch} = NoBatch(),
+    nlevel::Int=2,
+) where {T}
+    raw = CUDA.zeros(T, nlevel ^ total, YaoArrayRegister._asint(nbatch))
+    raw[Int(bit_config)+1,:] .= Ref(one(T))
+    return arrayreg(raw; nbatch=nbatch, nlevel=nlevel)
+end
+
+cuzero_state(n::Int; kwargs...) = cuzero_state(ComplexF64, n; kwargs...)
+cuzero_state(::Type{T}, n::Int; kwargs...) where {T} = cuproduct_state(T, n, 0; kwargs...)
+
+"""
+    curand_state([T=ComplexF64], n::Int; nbatch=1)
+
+The GPU version of [`rand_state`](@ref).
+"""
+curand_state(n::Int; kwargs...) = curand_state(ComplexF64, n; kwargs...)
+
+function curand_state(
+    ::Type{T},
+    n::Int;
+    nbatch::Union{Int,NoBatch} = NoBatch(),
+    nlevel = 2,
+) where {T}
+    raw = CUDA.randn(T, nlevel ^ n, YaoArrayRegister._asint(nbatch))
+    return normalize!(arrayreg(raw; nbatch=nbatch, nlevel=nlevel))
+end
+
+"""
+    cuuniform_state([T=ComplexF64], n::Int; nbatch=1)
+
+The GPU version of [`uniform_state`](@ref).
+"""
+cuuniform_state(n::Int; kwargs...) = cuuniform_state(ComplexF64, n; kwargs...)
+function cuuniform_state(::Type{T}, n::Int;
+    nbatch::Union{Int,NoBatch} = NoBatch(),
+    nlevel::Int = 2,
+) where {T}
+    raw = CUDA.ones(T, nlevel ^ n, YaoArrayRegister._asint(nbatch))
+    return normalize!(arrayreg(raw; nbatch=nbatch, nlevel=nlevel))
+end
+
+"""
+    cughz_state([T=ComplexF64], n::Int; nbatch=1)
+
+The GPU version of [`ghz_state`](@ref).
+"""
+cughz_state(n::Int; kwargs...) = cughz_state(ComplexF64, n; kwargs...)
+function cughz_state(::Type{T}, n::Int; kwargs...) where {T}
+    reg = cuzero_state(T, n; kwargs...)
+    reg.state[1:1,:] .= Ref(sqrt(T(0.5)))
+    reg.state[end:end,:] .= Ref(sqrt(T(0.5)))
+    return reg
+end
+
+#=
+for FUNC in [:measure!, :measure!]
+    @eval function $FUNC(rng::AbstractRNG, op::AbstractBlock, reg::AbstractCuArrayReg, al::AllLocs; kwargs...) where B
+        E, V = eigen!(mat(op) |> Matrix)
+        ei = Eigen(E|>cu, V|>cu)
+        $FUNC(rng::AbstractRNG, ei, reg, al; kwargs...)
+    end
+end
+=#
+
+function YaoBlocks.expect(op::AbstractAdd, dm::CuDensityMatrix)
+    sum(x->expect(x, dm), subblocks(op))
+end
+function YaoBlocks.expect(op::AbstractBlock, dm::CuDensityMatrix{D}) where D
+    return tr(apply(ArrayReg{D}(dm.state), op).state)
+end
+
+measure(
+    ::ComputationalBasis,
+    reg::CuDensityMatrix,
+    ::AllLocs;
+    nshots::Int = 1,
+    rng::AbstractRNG = Random.GLOBAL_RNG,
+) = YaoArrayRegister._measure(rng, basis(reg), Array(reg |> probs), nshots)
+
diff --git a/ext/CuYao/test/CUDApatch.jl b/ext/CuYao/test/CUDApatch.jl
new file mode 100644
index 00000000..b00adb96
--- /dev/null
+++ b/ext/CuYao/test/CUDApatch.jl
@@ -0,0 +1,35 @@
+using CuYao
+using CUDA
+using Test
+using Yao.YaoBlocks
+
+@testset "isapprox-complex" begin
+    ca = CuArray(randn(ComplexF64,3,3))
+    cb = copy(ca)
+    #@test ca ≈ cb still error!
+    cb[1:1, 1:1] .+= 1e-7im
+    @test isapprox(ca, cb, atol=1e-5)
+    @test !isapprox(ca, cb, atol=1e-9)
+end
+
+@testset "view general" begin
+    a = randn(5,6,8) |> CuArray
+    @test view(a, 2:4, 4, [1,4,3]) |> size == (3, 3)
+end
+
+@testset "permutedims vector" begin
+    ca = randn(ComplexF64,3,4,5,1)
+    @test permutedims(CuArray(ca), [2,1,4,3]) |> Array ≈ permutedims(ca, [2,1,4,3])
+end
+
+@testset "Complex pow" begin
+    for T in [ComplexF64, ComplexF32]
+        a = CuArray(randn(T, 4, 4))
+        @test Array(a .^ Int32(3)) ≈ Array(a).^3
+        @test Array(a .^ real(T)(3)) ≈ Array(a).^3
+    end
+end
+
+@testset "as_scalar" begin
+    @test YaoBlocks.AD.as_scalar([2.0] |> CuArray) == 2.0
+end
diff --git a/ext/CuYao/test/extra.jl b/ext/CuYao/test/extra.jl
new file mode 100644
index 00000000..739ee6cb
--- /dev/null
+++ b/ext/CuYao/test/extra.jl
@@ -0,0 +1,52 @@
+using CuYao, Test, CUDA
+CUDA.allowscalar(false)
+
+@testset "gradient" begin
+    reg1 = rand_state(10)
+    reg2 = rand_state(10)
+    c = EasyBuild.variational_circuit(10)
+    g1, g2 = fidelity'(reg1=>c, reg2)
+    cg1, cg2 = fidelity'(cu(reg1)=>c, cu(reg2))
+    @test g1.first ≈ cpu(cg1.first)
+    @test g1.second ≈ cg1.second
+    @test g2 ≈ cpu(cg2)
+
+    h = EasyBuild.heisenberg(10)
+    g1 = expect'(h, reg1=>c)
+    cg1 = expect'(h, cu(reg1)=>c)
+    @test g1.first ≈ cpu(cg1.first)
+    @test g1.second ≈ cg1.second
+end
+
+@testset "apply density matrix" begin
+    reg = rand_state(6)
+    creg = cu(reg)
+    rho = density_matrix(reg, (3,4))
+    crho = density_matrix(creg, (3,4))
+    @test rho ≈ cpu(crho)
+    g = put(2, 1=>Rx(0.3))
+    @test cpu(apply(crho, g)) ≈ apply(rho, g)
+    rho = density_matrix(reg)
+    crho = density_matrix(creg)
+    @test rho ≈ cpu(crho)
+    @test probs(crho) ≈ probs(creg)
+    g = put(6, 1=>Rx(0.3))
+    @test cpu(apply(crho, g)) ≈ apply(rho, g)
+
+    # channel
+    c = UnitaryChannel([put(6, 1=>Rx(0.3)), put(6, 2=>Z)], [0.4, 0.6])
+    @test cpu(apply(crho, c)) ≈ apply(rho, c)
+end
+
+@testset "expect on density matrix" begin
+    reg = rand_state(6)
+    rho = density_matrix(reg, (3,4,5))
+    crho = cu(rho)
+    h = EasyBuild.heisenberg(3)
+    a = expect(h, rho)
+    b = expect(h, crho)
+    @test a ≈ b
+    # fidelity
+    @test fidelity(crho, crho) ≈ 1
+    @test measure(crho; nshots=2) isa Vector
+end
\ No newline at end of file
diff --git a/ext/CuYao/test/instructs.jl b/ext/CuYao/test/instructs.jl
new file mode 100644
index 00000000..300e779a
--- /dev/null
+++ b/ext/CuYao/test/instructs.jl
@@ -0,0 +1,135 @@
+using LinearAlgebra, Yao.ConstGate
+using Test, Random
+using CuYao
+using Yao.YaoArrayRegister.StaticArrays
+using Yao.ConstGate: SWAPGate
+using CUDA
+
+@testset "gpu instruct nbit!" begin
+    Random.seed!(3)
+    nbit = 6
+    N = 1<<nbit
+    LOC1 = SVector{2}([0, 1])
+    v1 = randn(ComplexF32, N)
+    vn = randn(ComplexF32, N, 333)
+
+    for UN in [
+            rand_unitary(ComplexF32, 4),
+            mat(ComplexF32, CNOT),
+            mat(ComplexF32, control(2,2,1=>Z)),
+            mat(ComplexF32, put(2,2=>I2)),
+            mat(ComplexF32, put(2,2=>P0))
+            ]
+
+        @test instruct!(Val(2),v1 |> CuArray, UN, (3,1)) |> Vector ≈ instruct!(Val(2),v1 |> copy, UN, (3,1))
+        @test instruct!(Val(2),vn |> CuArray, UN, (3,1)) |> Matrix ≈ instruct!(Val(2),vn |> copy, UN, (3,1))
+    end
+    # sparse matrix like P0, P1 et. al. are not implemented.
+    for g in [mat(ComplexF32, ConstGate.T), mat(ComplexF32, ConstGate.H), mat(ComplexF32, rot(Z, 0.5))]
+        @test instruct!(Val(2), v1 |> CuArray, g, (3,), (4,), (1,)) |> Vector ≈ instruct!(Val(2), v1 |> copy, g, (3,), (4,), (1,))
+        @test instruct!(Val(2), vn |> CuArray, g, (3,), (4,), (1,)) |> Matrix ≈ instruct!(Val(2), vn |> copy, g, (3,), (4,), (1,))
+        @test instruct!(Val(2), v1 |> CuArray, g, (3,), (4,), (1,)) |> Vector ≈ instruct!(Val(2), v1 |> copy, g, (3,), (4,), (1,))
+        @test instruct!(Val(2), vn |> CuArray, g, (3,), (4,), (1,)) |> Matrix ≈ instruct!(Val(2), vn |> copy, g, (3,), (4,), (1,))
+    end
+end
+
+@testset "gpu swapapply!" begin
+    nbit = 6
+    N = 1<<nbit
+    LOC1 = SVector{2}([0, 1])
+    v1 = randn(ComplexF32, N)
+    vn = randn(ComplexF32, N, 333)
+
+    @test instruct!(Val(2),v1 |> CuArray, Val(:SWAP), (3,5)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(:SWAP), (3,5))
+    @test instruct!(Val(2),vn |> CuArray, Val(:SWAP), (3,5)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(:SWAP), (3,5))
+end
+
+@testset "gpu instruct! 1bit" begin
+    nbit = 6
+    N = 1<<nbit
+    LOC1 = SVector{2}([0, 1])
+    v1 = randn(ComplexF64, N)
+    vn = randn(ComplexF64, N, 333)
+
+    for U1 in [mat(H), mat(Z), mat(I2), mat(ConstGate.P0), mat(X), mat(Y)]
+        @test instruct!(Val(2),v1 |> CuArray, U1, (3,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, U1, (3,))
+        @test instruct!(Val(2),vn |> CuArray, U1, (3,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, U1, (3,))
+    end
+    # sparse matrix like P0, P1 et. al. are not implemented.
+end
+
+@testset "gpu xyz-instruct!" begin
+    nbit = 6
+    N = 1<<nbit
+    LOC1 = SVector{2}([0, 1])
+    v1 = randn(ComplexF32, N)
+    vn = randn(ComplexF32, N, 333)
+
+    for G in [:X, :Y, :Z, :T, :H, :Tdag, :S, :Sdag]
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (3,))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (3,))
+        if G != :H
+            @test instruct!(Val(2),v1 |> CuArray, Val(G), (1,3,4)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (1,3,4))
+            @test instruct!(Val(2),vn |> CuArray,  Val(G),(1,3,4)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (1,3,4))
+        end
+    end
+end
+
+@testset "gpu cn-xyz-instruct!" begin
+    nbit = 6
+    N = 1<<nbit
+    LOC1 = SVector{2}([0, 1])
+    v1 = randn(ComplexF32, N)
+    vn = randn(ComplexF32, N, 333)
+
+    for G in [:X, :Y, :Z, :T, :Tdag, :S, :Sdag]
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,), (4,5), (0, 1)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (3,), (4,5), (0, 1))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,), (4,5), (0, 1)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (3,), (4,5), (0, 1))
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,), (1,), (1,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G),(3,), (1,), (1,))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,), (1,), (1,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G),(3,), (1,), (1,))
+    end
+end
+
+@testset "pswap" begin
+    ps = put(6, (2,4)=>rot(SWAP, π/2))
+    reg = rand_state(6; nbatch=10)
+    @test apply!(reg |> cu, ps) |> cpu ≈ apply!(copy(reg), ps)
+    @test apply!(reg |> cu, ps).state isa CuArray
+end
+
+@testset "regression test: Rx, Ry, Rz, CPHASE" begin
+    Random.seed!(3)
+    nbit = 6
+    for ps in [put(6, (2,)=>Rx(π/2)), put(6, 2=>Ry(0.5)),  put(6, 2=>Rz(0.4))]
+        reg = rand_state(6; nbatch=10)
+        @test apply!(reg |> cu, ps) |> cpu ≈ apply!(copy(reg), ps)
+        @test apply!(reg |> cu, ps).state isa CuArray
+    end
+end
+
+@testset "density matrix" begin
+    nbit = 6
+    reg = rand_state(nbit)
+    rho = density_matrix(reg)
+    c = put(6, (2,)=>Rx(π/3))
+    @test apply(rho |> cu, c) |> cpu ≈ apply(rho, c)
+end
+
+@testset "time evolve" begin
+    g = time_evolve(kron(10, 2=>X, 3=>X), 0.5)
+    reg = rand_state(10)
+    @test apply!(copy(reg), g) ≈ apply!(reg |> cu, g) |> cpu
+end
+
+# fix: https://github.com/QuantumBFS/CuYao.jl/issues/81
+@testset "generic sparse (pqc circuit)" begin
+    # kron of Rx
+    pqc_circuit = subroutine(10, kron(Rx(0.4), Rx(0.5), Rx(0.6), Rx(0.8)), (1, 2, 6, 5))
+    proxy_reg = zero_state(10)
+    @test apply!(proxy_reg |> cu, pqc_circuit) |> cpu ≈ apply(proxy_reg, pqc_circuit)
+
+    # kron of Rz
+    pqc_circuit = subroutine(10, kron(Rz(0.4), Rz(0.5), Rz(0.6), Rz(0.8)), (1, 2, 6, 5))
+    proxy_reg = zero_state(10)
+    @test apply!(proxy_reg |> cu, pqc_circuit) |> cpu ≈ apply(proxy_reg, pqc_circuit)
+end
\ No newline at end of file
diff --git a/ext/CuYao/test/register.jl b/ext/CuYao/test/register.jl
new file mode 100644
index 00000000..8c9551c0
--- /dev/null
+++ b/ext/CuYao/test/register.jl
@@ -0,0 +1,139 @@
+using Test
+using CuYao
+using CuYao: tri2ij
+using LinearAlgebra
+using Yao.YaoArrayRegister.BitBasis
+using Statistics: mean
+using Yao.YaoArrayRegister.StaticArrays
+using CUDA
+using Yao.YaoArrayRegister: batched_kron, batched_kron!, batch_normalize!
+CUDA.allowscalar(false)
+
+@testset "basics" begin
+    a = randn(ComplexF64, 50, 20)
+    ca = a|>cu
+    batch_normalize!(ca)
+    batch_normalize!(a)
+    @test ca |> Matrix ≈ a
+
+    for l = 1:100
+        i, j = tri2ij(l)
+        @test (i-1)*(i-2)÷2+j == l
+    end
+end
+
+@testset "constructor an measure" begin
+    reg = rand_state(10)
+    greg = reg |> cu
+    @test greg isa AbstractCuArrayReg
+    @test eltype(greg.state) == ComplexF64
+    myvec(x) = Vector(x)
+    myvec(x::Number) = [x]
+    for reg in [rand_state(10, nbatch=333), rand_state(10)]
+        greg = reg |> cu
+        @test size(measure(greg |> copy, nshots=10)) == size(measure(reg, nshots=10))
+        @test size(measure!(greg |> copy)) == size(measure!(reg |> copy))
+        @test size(measure!(ResetTo(0), greg |> copy)) == size(measure!(ResetTo(0), reg |> copy))
+        @test size(measure!(RemoveMeasured(), greg |> copy)) == size(measure!(RemoveMeasured(), reg |> copy))
+        @test select(greg |> copy, 12) |> cpu ≈ select(reg, 12)
+        @test size(measure!(greg |> copy |> focus!(3,4,1))) == size(measure!(reg |> copy |> focus!(3,4,1)))
+        @test greg |> copy |> focus!(3,4,1) |> relax!(3,4,1) |> cpu ≈ reg
+
+        if nbatch(greg) == 1
+            greg1 = greg |> copy |> focus!(1,4,3)
+            greg0 = copy(greg1)
+            res = measure!(RemoveMeasured(), greg1)
+            @test select(greg0, res |> myvec) |> normalize! |> cpu ≈ greg1 |> cpu
+        end
+
+        greg1 = greg |> copy |> focus!(1,4,3)
+        greg0 = copy(greg1)
+        res = measure!(ResetTo(3), greg1)
+        @test all(measure(greg1, nshots=10) .== 3)
+        @test greg1 |> isnormalized
+        @test all(select.(BatchedArrayReg(greg0 |> cpu), res |> myvec) .|> normalize! .≈ select.(BatchedArrayReg(greg1 |> cpu), 3))
+
+        greg1 = greg |> copy |> focus!(1,4,3)
+        greg0 = copy(greg1)
+        res = measure!(greg1)
+        @test all(select.(BatchedArrayReg(greg0 |> cpu), res |> myvec) .|> normalize! .≈ select.(BatchedArrayReg(greg1 |> cpu), res|>myvec))
+    end
+
+    @test join(rand_state(3) |> cu, rand_state(3) |> cu) |> nactive == 6
+    @test join(rand_state(3, nbatch=10) |> cu, rand_state(3, nbatch=10) |> cu) |> nactive == 6
+end
+
+@testset "insert_qubits!" begin
+    reg = rand_state(5; nbatch=10)
+    res = insert_qudits!(reg |> cu, 3, 2) |> cpu
+    @test insert_qudits!(reg, 3, 2) ≈ res
+    @test append_qudits!(copy(reg) |> cu, 3) |> cpu ≈ append_qudits!(copy(reg), 3)
+
+    reg = rand_state(5, nbatch=10) |>focus!(2,3)
+    res = insert_qudits!(reg |> cu, 3, 2) |> cpu
+    @test insert_qudits!(reg, 3, 2) ≈ res
+
+    @test append_qudits!(copy(reg) |> cu, 3) |> cpu ≈ append_qudits!(copy(reg), 3)
+end
+
+@testset "cuda-op-measures" begin
+    reg = rand_state(8; nbatch=32) |> cu
+    op = repeat(5, X, 1:5)
+
+    # measure!
+    reg2 = reg |> copy
+    res = measure!(op, reg2, 2:6)
+    res2 = measure!(op, reg2, 2:6)
+    @test size(res) == (32,)
+    @test res2 == res
+
+    reg = clone(ArrayReg([1,-1+0im]/sqrt(2.0)), 10) |> cu
+    @test measure!(X, reg) |> mean ≈ -1
+    reg = clone(ArrayReg([1.0,0+0im]), 1000)
+    @test abs(measure!(X, reg) |> mean) < 0.1
+end
+
+@testset "cuda kron getindex" begin
+    a = randn(3,4)
+    b = randn(4,2)
+    c = zeros(12,8)
+    ca, cb, cc = cu(a), cu(b), cu(c)
+    @test kron(ca, cb) |> Array ≈ kron(a, b)
+    @test Yao.YaoArrayRegister.kron!(cc, ca, cb) |> Array ≈ kron(a,b)
+
+    Yao.YaoArrayRegister.kron!(c,a,b)
+    @test cc |> Array ≈ c
+
+    v = randn(100) |> cu
+    inds = [3,5,2,1,7,1]
+    @test v[inds] ≈ v[inds |> CuVector]
+end
+
+@testset "cuda batched_kron" begin
+    a = randn(3,4,5)
+    b = randn(4,2,5)
+    c = zeros(12,8,5)
+    ca, cb, cc = cu(a), cu(b), cu(c)
+
+    @test batched_kron(ca, cb) |> Array ≈ batched_kron(a, b)
+    @test batched_kron!(cc, ca, cb) |> Array ≈ batched_kron(a, b)
+
+    batched_kron!(c, a, b)
+    @test cc |> Array ≈ c
+end
+
+@testset "zero_state, et al" begin
+    for b = [4, NoBatch()]
+        reg = cuzero_state(3; nbatch=b)
+        @test cpu(reg) ≈ zero_state(3; nbatch=b) && reg isa AbstractCuArrayReg
+        reg = curand_state(3; nbatch=b)
+        @test reg isa AbstractCuArrayReg
+        reg = cuuniform_state(3; nbatch=b)
+        @test cpu(reg) ≈ uniform_state(3; nbatch=b) && reg isa AbstractCuArrayReg
+        reg = cuproduct_state(bit"110"; nbatch=b)
+        @test cpu(reg) ≈ product_state(bit"110"; nbatch=b) && reg isa AbstractCuArrayReg
+        reg = cughz_state(3; nbatch=b)
+        @test cpu(reg) ≈ ghz_state(3; nbatch=b) && reg isa AbstractCuArrayReg
+    end
+end
+
diff --git a/ext/CuYao/test/runtests.jl b/ext/CuYao/test/runtests.jl
new file mode 100644
index 00000000..77df6514
--- /dev/null
+++ b/ext/CuYao/test/runtests.jl
@@ -0,0 +1,18 @@
+using CUDA, CuYao, Test
+CUDA.allowscalar(false)
+
+@testset "CUDA patch" begin
+    include("CUDApatch.jl")
+end
+
+@testset "GPU reg" begin
+    include("register.jl")
+end
+
+@testset "gpu applies" begin
+    include("instructs.jl")
+end
+
+@testset "extra" begin
+    include("extra.jl")
+end
\ No newline at end of file
diff --git a/lib/YaoPlots/src/vizcircuit.jl b/lib/YaoPlots/src/vizcircuit.jl
index 9e5478f9..d4d28cb9 100644
--- a/lib/YaoPlots/src/vizcircuit.jl
+++ b/lib/YaoPlots/src/vizcircuit.jl
@@ -367,13 +367,13 @@ for B in [:LabelBlock, :GeneralMatrixBlock, :Add]
         _draw!(c, [controls..., (address, c.gatestyles.g, string(cb))])
     end
 end
-for GT in [:KronBlock, :RepeatedBlock, :CachedBlock, :Subroutine, :(YaoBlocks.AD.NoParams)]
-    @eval function draw!(c::CircuitGrid, p::$GT, address, controls)
-        barrier_style = CircuitStyles.barrier_for_chain[]
-        CircuitStyles.barrier_for_chain[] = false
-        draw!(c, YaoBlocks.Optimise.to_basictypes(p), address, controls)
-        CircuitStyles.barrier_for_chain[] = barrier_style
-    end
+
+# [:KronBlock, :RepeatedBlock, :CachedBlock, :Subroutine, :(YaoBlocks.AD.NoParams)]
+function draw!(c::CircuitGrid, p::CompositeBlock, address, controls)
+    barrier_style = CircuitStyles.barrier_for_chain[]
+    CircuitStyles.barrier_for_chain[] = false
+    draw!(c, YaoBlocks.Optimise.to_basictypes(p), address, controls)
+    CircuitStyles.barrier_for_chain[] = barrier_style
 end
 for (GATE, SYM) in [(:XGate, :Rx), (:YGate, :Ry), (:ZGate, :Rz)]
     @eval get_brush_texts(c, b::RotationGate{D,T,<:$GATE}) where {D,T} = (c.gatestyles.g, "$($(SYM))($(pretty_angle(b.theta)))")

From 3a93015f34739527a3227db882d08c823f73c736 Mon Sep 17 00:00:00 2001
From: GiggleLiu <cacate0129@gmail.com>
Date: Sat, 24 Feb 2024 17:37:01 +0000
Subject: [PATCH 2/6] tests pass

---
 Project.toml                      |  1 -
 ext/CuYao/src/CuYao.jl            |  3 +--
 ext/CuYao/test/CUDApatch.jl       |  2 +-
 ext/CuYao/{ => test}/Project.toml |  4 ----
 ext/CuYao/test/extra.jl           |  4 ++--
 ext/CuYao/test/instructs.jl       |  4 ++--
 ext/CuYao/test/register.jl        | 18 +++++++++---------
 ext/CuYao/test/runtests.jl        |  4 ++--
 src/Yao.jl                        |  7 ++++++-
 9 files changed, 23 insertions(+), 24 deletions(-)
 rename ext/CuYao/{ => test}/Project.toml (88%)

diff --git a/Project.toml b/Project.toml
index 84787249..4faa42d5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,7 +4,6 @@ version = "0.8.13"
 
 [deps]
 BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LuxurySparse = "d05aeea4-b7d4-55ac-b691-9e7fabb07ba2"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
diff --git a/ext/CuYao/src/CuYao.jl b/ext/CuYao/src/CuYao.jl
index cc64a45e..a04c2417 100644
--- a/ext/CuYao/src/CuYao.jl
+++ b/ext/CuYao/src/CuYao.jl
@@ -24,8 +24,7 @@ import Yao.YaoArrayRegister: u1rows!, unrows!, autostatic, instruct!, swaprows!,
 import Yao.YaoArrayRegister.LinearAlgebra: norm
 import Base: kron, getindex
 
-export cpu, cu, AbstractCuArrayReg, CuArrayReg, CuBatchedArrayReg, CuDensityMatrix,
-    cuzero_state, cuuniform_state, curand_state, cuproduct_state, cughz_state
+import Yao: cpu, cuzero_state, cuuniform_state, curand_state, cuproduct_state, cughz_state
 
 const Ints = NTuple{<:Any, Int}
 
diff --git a/ext/CuYao/test/CUDApatch.jl b/ext/CuYao/test/CUDApatch.jl
index b00adb96..136d7aeb 100644
--- a/ext/CuYao/test/CUDApatch.jl
+++ b/ext/CuYao/test/CUDApatch.jl
@@ -1,4 +1,4 @@
-using CuYao
+using Yao
 using CUDA
 using Test
 using Yao.YaoBlocks
diff --git a/ext/CuYao/Project.toml b/ext/CuYao/test/Project.toml
similarity index 88%
rename from ext/CuYao/Project.toml
rename to ext/CuYao/test/Project.toml
index 88d577a0..2e5f158c 100644
--- a/ext/CuYao/Project.toml
+++ b/ext/CuYao/test/Project.toml
@@ -1,7 +1,3 @@
-name = "CuYao"
-uuid = "b48ca7a8-dd42-11e8-2b8e-1b7706800275"
-version = "0.3.9"
-
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/ext/CuYao/test/extra.jl b/ext/CuYao/test/extra.jl
index 739ee6cb..95e759ce 100644
--- a/ext/CuYao/test/extra.jl
+++ b/ext/CuYao/test/extra.jl
@@ -1,4 +1,4 @@
-using CuYao, Test, CUDA
+using Yao, Test, CUDA
 CUDA.allowscalar(false)
 
 @testset "gradient" begin
@@ -49,4 +49,4 @@ end
     # fidelity
     @test fidelity(crho, crho) ≈ 1
     @test measure(crho; nshots=2) isa Vector
-end
\ No newline at end of file
+end
diff --git a/ext/CuYao/test/instructs.jl b/ext/CuYao/test/instructs.jl
index 300e779a..48ae4c69 100644
--- a/ext/CuYao/test/instructs.jl
+++ b/ext/CuYao/test/instructs.jl
@@ -1,6 +1,6 @@
 using LinearAlgebra, Yao.ConstGate
 using Test, Random
-using CuYao
+using Yao
 using Yao.YaoArrayRegister.StaticArrays
 using Yao.ConstGate: SWAPGate
 using CUDA
@@ -132,4 +132,4 @@ end
     pqc_circuit = subroutine(10, kron(Rz(0.4), Rz(0.5), Rz(0.6), Rz(0.8)), (1, 2, 6, 5))
     proxy_reg = zero_state(10)
     @test apply!(proxy_reg |> cu, pqc_circuit) |> cpu ≈ apply(proxy_reg, pqc_circuit)
-end
\ No newline at end of file
+end
diff --git a/ext/CuYao/test/register.jl b/ext/CuYao/test/register.jl
index 8c9551c0..8c6dc1d6 100644
--- a/ext/CuYao/test/register.jl
+++ b/ext/CuYao/test/register.jl
@@ -1,6 +1,6 @@
 using Test
-using CuYao
-using CuYao: tri2ij
+using Yao
+const CuYao = Base.get_extension(Yao, :CuYao)
 using LinearAlgebra
 using Yao.YaoArrayRegister.BitBasis
 using Statistics: mean
@@ -17,7 +17,7 @@ CUDA.allowscalar(false)
     @test ca |> Matrix ≈ a
 
     for l = 1:100
-        i, j = tri2ij(l)
+        i, j = CuYao.tri2ij(l)
         @test (i-1)*(i-2)÷2+j == l
     end
 end
@@ -25,7 +25,7 @@ end
 @testset "constructor an measure" begin
     reg = rand_state(10)
     greg = reg |> cu
-    @test greg isa AbstractCuArrayReg
+    @test greg isa CuYao.AbstractCuArrayReg
     @test eltype(greg.state) == ComplexF64
     myvec(x) = Vector(x)
     myvec(x::Number) = [x]
@@ -125,15 +125,15 @@ end
 @testset "zero_state, et al" begin
     for b = [4, NoBatch()]
         reg = cuzero_state(3; nbatch=b)
-        @test cpu(reg) ≈ zero_state(3; nbatch=b) && reg isa AbstractCuArrayReg
+        @test cpu(reg) ≈ zero_state(3; nbatch=b) && reg isa CuYao.AbstractCuArrayReg
         reg = curand_state(3; nbatch=b)
-        @test reg isa AbstractCuArrayReg
+        @test reg isa CuYao.AbstractCuArrayReg
         reg = cuuniform_state(3; nbatch=b)
-        @test cpu(reg) ≈ uniform_state(3; nbatch=b) && reg isa AbstractCuArrayReg
+        @test cpu(reg) ≈ uniform_state(3; nbatch=b) && reg isa CuYao.AbstractCuArrayReg
         reg = cuproduct_state(bit"110"; nbatch=b)
-        @test cpu(reg) ≈ product_state(bit"110"; nbatch=b) && reg isa AbstractCuArrayReg
+        @test cpu(reg) ≈ product_state(bit"110"; nbatch=b) && reg isa CuYao.AbstractCuArrayReg
         reg = cughz_state(3; nbatch=b)
-        @test cpu(reg) ≈ ghz_state(3; nbatch=b) && reg isa AbstractCuArrayReg
+        @test cpu(reg) ≈ ghz_state(3; nbatch=b) && reg isa CuYao.AbstractCuArrayReg
     end
 end
 
diff --git a/ext/CuYao/test/runtests.jl b/ext/CuYao/test/runtests.jl
index 77df6514..420a735c 100644
--- a/ext/CuYao/test/runtests.jl
+++ b/ext/CuYao/test/runtests.jl
@@ -1,4 +1,4 @@
-using CUDA, CuYao, Test
+using CUDA, Yao, Test
 CUDA.allowscalar(false)
 
 @testset "CUDA patch" begin
@@ -15,4 +15,4 @@ end
 
 @testset "extra" begin
     include("extra.jl")
-end
\ No newline at end of file
+end
diff --git a/src/Yao.jl b/src/Yao.jl
index a42710b0..7b1deac1 100644
--- a/src/Yao.jl
+++ b/src/Yao.jl
@@ -16,7 +16,6 @@ const 幺 = Yao
 
 using Reexport
 @reexport using YaoArrayRegister, YaoBlocks, YaoSym, YaoPlots
-export EasyBuild
 using YaoArrayRegister.BitBasis, YaoAPI
 
 using YaoBlocks:
@@ -28,6 +27,12 @@ using YaoBlocks:
     print_title,
     print_block
 
+export EasyBuild
+# CUDA APIs
+for FT in [:cpu, :cuzero_state, :cuuniform_state, :curand_state, :cuproduct_state, :cughz_state]
+    @eval export $FT
+    @eval function $FT end
+end
 
 include("deprecations.jl")
 include("EasyBuild/easybuild.jl")

From 1154106bb03079365d7f03a451f7a6d351f44d8e Mon Sep 17 00:00:00 2001
From: GiggleLiu <cacate0129@gmail.com>
Date: Sun, 25 Feb 2024 02:37:34 +0800
Subject: [PATCH 3/6] add makefile

---
 Makefile          | 36 ++++++++++++++++++++++++++++++++++++
 docs/Project.toml |  1 +
 2 files changed, 37 insertions(+)
 create mode 100644 Makefile

diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..93f0a1b3
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,36 @@
+JL = julia --project
+
+default: init test
+
+init:
+	$(JL) -e 'using Pkg; Pkg.develop([Pkg.PackageSpec(path = joinpath("lib", pkg)) for pkg in ["YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", "YaoAPI"]]); Pkg.precompile()'
+init-docs:
+	$(JL) -e 'using Pkg; Pkg.activate("docs"); Pkg.develop([Pkg.PackageSpec(path = joinpath("lib", pkg)) for pkg in ["YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", "YaoAPI"]]); Pkg.precompile()'
+
+update:
+	$(JL) -e 'using Pkg; Pkg.update(); Pkg.precompile()'
+update-docs:
+	$(JL) -e 'using Pkg; Pkg.activate("docs"); Pkg.update(); Pkg.precompile()'
+
+test-Yao:
+	$(JL) -e 'using Pkg; Pkg.test("Yao")'
+test-CuYao:
+	$(JL) -e 'using Pkg; Pkg.activate("ext/CuYao/test"); Pkg.develop(path="."); Pkg.update()'
+	$(JL) -e 'using Pkg; Pkg.activate("ext/CuYao/test"); include("ext/CuYao/test/runtests.jl")'
+
+test-%:
+	$(JL) -e 'using Pkg; Pkg.test("$*")'
+
+test:
+	$(JL) -e 'using Pkg; Pkg.test(["YaoAPI", "YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", "Yao"])'
+
+coverage:
+	$(JL) -e 'using Pkg; Pkg.test(["YaoAPI", "YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", "Yao"]; coverage=true)'
+
+servedocs:
+	$(JL) -e 'using Pkg; Pkg.activate("docs"); using LiveServer; servedocs(;skip_dirs=["docs/src/assets", "docs/src/generated"])'
+
+clean:
+	rm -rf docs/build
+
+.PHONY: init test
diff --git a/docs/Project.toml b/docs/Project.toml
index d6d7fde0..d0ab5651 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -7,6 +7,7 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
 Latexify = "23fbe1c1-3f47-55db-b15f-69d7ec21a316"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
+LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"

From c9a24ba97278dfd1f7f0110bdef075a07da67215 Mon Sep 17 00:00:00 2001
From: GiggleLiu <cacate0129@gmail.com>
Date: Sat, 24 Feb 2024 18:50:04 +0000
Subject: [PATCH 4/6] update make file

---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 93f0a1b3..45e53f2d 100644
--- a/Makefile
+++ b/Makefile
@@ -32,5 +32,6 @@ servedocs:
 
 clean:
 	rm -rf docs/build
+	find . -name "*.cov" -type f -print0 | xargs -0 /bin/rm -f
 
 .PHONY: init test

From 1550f32b7c22333b508db03537e949adab40ad82 Mon Sep 17 00:00:00 2001
From: GiggleLiu <cacate0129@gmail.com>
Date: Mon, 26 Feb 2024 02:26:19 +0800
Subject: [PATCH 5/6] rm CuYao LICENSE

---
 ext/CuYao/LICENSE.md | 217 -------------------------------------------
 1 file changed, 217 deletions(-)
 delete mode 100644 ext/CuYao/LICENSE.md

diff --git a/ext/CuYao/LICENSE.md b/ext/CuYao/LICENSE.md
deleted file mode 100644
index 8373be63..00000000
--- a/ext/CuYao/LICENSE.md
+++ /dev/null
@@ -1,217 +0,0 @@
-The CuYao.jl package is licensed under the Apache License, Version 2.0:
-
-> COPYRIGHT
->
-> All contributions by Jinguo Liu
-> Copyright (c) 2018: Jinguo Liu
-> All rights reserved
->
-> All contributions by Xiuzhe Luo
-> Copyright (c) 2018: Xiuzhe Luo.
-> All rights reserved.
->
-> Each contributor holds copyright over their respective contributions.
-> The project versioning (Git) records all such contribution source information.
->
->                                  Apache License
->                            Version 2.0, January 2004
->                         http://www.apache.org/licenses/
->
->    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
->
->    1. Definitions.
->
->       "License" shall mean the terms and conditions for use, reproduction,
->       and distribution as defined by Sections 1 through 9 of this document.
->
->       "Licensor" shall mean the copyright owner or entity authorized by
->       the copyright owner that is granting the License.
->
->       "Legal Entity" shall mean the union of the acting entity and all
->       other entities that control, are controlled by, or are under common
->       control with that entity. For the purposes of this definition,
->       "control" means (i) the power, direct or indirect, to cause the
->       direction or management of such entity, whether by contract or
->       otherwise, or (ii) ownership of fifty percent (50%) or more of the
->       outstanding shares, or (iii) beneficial ownership of such entity.
->
->       "You" (or "Your") shall mean an individual or Legal Entity
->       exercising permissions granted by this License.
->
->       "Source" form shall mean the preferred form for making modifications,
->       including but not limited to software source code, documentation
->       source, and configuration files.
->
->       "Object" form shall mean any form resulting from mechanical
->       transformation or translation of a Source form, including but
->       not limited to compiled object code, generated documentation,
->       and conversions to other media types.
->
->       "Work" shall mean the work of authorship, whether in Source or
->       Object form, made available under the License, as indicated by a
->       copyright notice that is included in or attached to the work
->       (an example is provided in the Appendix below).
->
->       "Derivative Works" shall mean any work, whether in Source or Object
->       form, that is based on (or derived from) the Work and for which the
->       editorial revisions, annotations, elaborations, or other modifications
->       represent, as a whole, an original work of authorship. For the purposes
->       of this License, Derivative Works shall not include works that remain
->       separable from, or merely link (or bind by name) to the interfaces of,
->       the Work and Derivative Works thereof.
->
->       "Contribution" shall mean any work of authorship, including
->       the original version of the Work and any modifications or additions
->       to that Work or Derivative Works thereof, that is intentionally
->       submitted to Licensor for inclusion in the Work by the copyright owner
->       or by an individual or Legal Entity authorized to submit on behalf of
->       the copyright owner. For the purposes of this definition, "submitted"
->       means any form of electronic, verbal, or written communication sent
->       to the Licensor or its representatives, including but not limited to
->       communication on electronic mailing lists, source code control systems,
->       and issue tracking systems that are managed by, or on behalf of, the
->       Licensor for the purpose of discussing and improving the Work, but
->       excluding communication that is conspicuously marked or otherwise
->       designated in writing by the copyright owner as "Not a Contribution."
->
->       "Contributor" shall mean Licensor and any individual or Legal Entity
->       on behalf of whom a Contribution has been received by Licensor and
->       subsequently incorporated within the Work.
->
->    2. Grant of Copyright License. Subject to the terms and conditions of
->       this License, each Contributor hereby grants to You a perpetual,
->       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
->       copyright license to reproduce, prepare Derivative Works of,
->       publicly display, publicly perform, sublicense, and distribute the
->       Work and such Derivative Works in Source or Object form.
->
->    3. Grant of Patent License. Subject to the terms and conditions of
->       this License, each Contributor hereby grants to You a perpetual,
->       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
->       (except as stated in this section) patent license to make, have made,
->       use, offer to sell, sell, import, and otherwise transfer the Work,
->       where such license applies only to those patent claims licensable
->       by such Contributor that are necessarily infringed by their
->       Contribution(s) alone or by combination of their Contribution(s)
->       with the Work to which such Contribution(s) was submitted. If You
->       institute patent litigation against any entity (including a
->       cross-claim or counterclaim in a lawsuit) alleging that the Work
->       or a Contribution incorporated within the Work constitutes direct
->       or contributory patent infringement, then any patent licenses
->       granted to You under this License for that Work shall terminate
->       as of the date such litigation is filed.
->
->    4. Redistribution. You may reproduce and distribute copies of the
->       Work or Derivative Works thereof in any medium, with or without
->       modifications, and in Source or Object form, provided that You
->       meet the following conditions:
->
->       (a) You must give any other recipients of the Work or
->           Derivative Works a copy of this License; and
->
->       (b) You must cause any modified files to carry prominent notices
->           stating that You changed the files; and
->
->       (c) You must retain, in the Source form of any Derivative Works
->           that You distribute, all copyright, patent, trademark, and
->           attribution notices from the Source form of the Work,
->           excluding those notices that do not pertain to any part of
->           the Derivative Works; and
->
->       (d) If the Work includes a "NOTICE" text file as part of its
->           distribution, then any Derivative Works that You distribute must
->           include a readable copy of the attribution notices contained
->           within such NOTICE file, excluding those notices that do not
->           pertain to any part of the Derivative Works, in at least one
->           of the following places: within a NOTICE text file distributed
->           as part of the Derivative Works; within the Source form or
->           documentation, if provided along with the Derivative Works; or,
->           within a display generated by the Derivative Works, if and
->           wherever such third-party notices normally appear. The contents
->           of the NOTICE file are for informational purposes only and
->           do not modify the License. You may add Your own attribution
->           notices within Derivative Works that You distribute, alongside
->           or as an addendum to the NOTICE text from the Work, provided
->           that such additional attribution notices cannot be construed
->           as modifying the License.
->
->       You may add Your own copyright statement to Your modifications and
->       may provide additional or different license terms and conditions
->       for use, reproduction, or distribution of Your modifications, or
->       for any such Derivative Works as a whole, provided Your use,
->       reproduction, and distribution of the Work otherwise complies with
->       the conditions stated in this License.
->
->    5. Submission of Contributions. Unless You explicitly state otherwise,
->       any Contribution intentionally submitted for inclusion in the Work
->       by You to the Licensor shall be under the terms and conditions of
->       this License, without any additional terms or conditions.
->       Notwithstanding the above, nothing herein shall supersede or modify
->       the terms of any separate license agreement you may have executed
->       with Licensor regarding such Contributions.
->
->    6. Trademarks. This License does not grant permission to use the trade
->       names, trademarks, service marks, or product names of the Licensor,
->       except as required for reasonable and customary use in describing the
->       origin of the Work and reproducing the content of the NOTICE file.
->
->    7. Disclaimer of Warranty. Unless required by applicable law or
->       agreed to in writing, Licensor provides the Work (and each
->       Contributor provides its Contributions) on an "AS IS" BASIS,
->       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
->       implied, including, without limitation, any warranties or conditions
->       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
->       PARTICULAR PURPOSE. You are solely responsible for determining the
->       appropriateness of using or redistributing the Work and assume any
->       risks associated with Your exercise of permissions under this License.
->
->    8. Limitation of Liability. In no event and under no legal theory,
->       whether in tort (including negligence), contract, or otherwise,
->       unless required by applicable law (such as deliberate and grossly
->       negligent acts) or agreed to in writing, shall any Contributor be
->       liable to You for damages, including any direct, indirect, special,
->       incidental, or consequential damages of any character arising as a
->       result of this License or out of the use or inability to use the
->       Work (including but not limited to damages for loss of goodwill,
->       work stoppage, computer failure or malfunction, or any and all
->       other commercial damages or losses), even if such Contributor
->       has been advised of the possibility of such damages.
->
->    9. Accepting Warranty or Additional Liability. While redistributing
->       the Work or Derivative Works thereof, You may choose to offer,
->       and charge a fee for, acceptance of support, warranty, indemnity,
->       or other liability obligations and/or rights consistent with this
->       License. However, in accepting such obligations, You may act only
->       on Your own behalf and on Your sole responsibility, not on behalf
->       of any other Contributor, and only if You agree to indemnify,
->       defend, and hold each Contributor harmless for any liability
->       incurred by, or claims asserted against, such Contributor by reason
->       of your accepting any such warranty or additional liability.
->
->    END OF TERMS AND CONDITIONS
->
->    APPENDIX: How to apply the Apache License to your work.
->
->       To apply the Apache License to your work, attach the following
->       boilerplate notice, with the fields enclosed by brackets "{}"
->       replaced with your own identifying information. (Don't include
->       the brackets!)  The text should be enclosed in the appropriate
->       comment syntax for the file format. We also recommend that a
->       file or class name and description of purpose be included on the
->       same "printed page" as the copyright notice for easier
->       identification within third-party archives.
->
->    Copyright [year] [fullname]
->
->    Licensed under the Apache License, Version 2.0 (the "License");
->    you may not use this file except in compliance with the License.
->    You may obtain a copy of the License at
->
->        http://www.apache.org/licenses/LICENSE-2.0
->
->    Unless required by applicable law or agreed to in writing, software
->    distributed under the License is distributed on an "AS IS" BASIS,
->    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
->    See the License for the specific language governing permissions and
->    limitations under the License.
->

From ac15ea8696ee54e905cdc6fe26f0f7cfaca0883a Mon Sep 17 00:00:00 2001
From: GiggleLiu <cacate0129@gmail.com>
Date: Thu, 29 Feb 2024 02:30:04 +0800
Subject: [PATCH 6/6] add CuYao and YaoPlots doc

---
 docs/make.jl              |  1 +
 docs/src/man/cuda.md      | 71 ++++++++++++++++++++++++++++++
 docs/src/man/plot.md      | 54 ++++++++++++++++++++++-
 ext/CuYao/README.md       | 90 +++------------------------------------
 ext/CuYao/src/register.jl | 20 ---------
 src/Yao.jl                |  8 ++--
 src/cudainterfaces.jl     | 41 ++++++++++++++++++
 7 files changed, 175 insertions(+), 110 deletions(-)
 create mode 100644 docs/src/man/cuda.md
 create mode 100644 src/cudainterfaces.jl

diff --git a/docs/make.jl b/docs/make.jl
index f67cad1a..9a549392 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -58,6 +58,7 @@ const PAGES = [
         "man/registers.md",
         "man/blocks.md",
         "man/symbolic.md",
+        "man/cuda.md",
         "man/plot.md",
         "man/automatic_differentiation.md",
         "man/simplification.md",
diff --git a/docs/src/man/cuda.md b/docs/src/man/cuda.md
new file mode 100644
index 00000000..98454f1b
--- /dev/null
+++ b/docs/src/man/cuda.md
@@ -0,0 +1,71 @@
+```@meta
+CurrentModule = Yao
+```
+
+# CUDA extension - CuYao
+
+## Tutorial
+`CuYao` is a CUDA extension of Yao, which allows you to run Yao circuits on GPU. The usage of `CuYao` is similar to `Yao`, but with some extra APIs to upload and download registers between CPU and GPU:
+- `cu(reg)` to upload a registe `reg` to GPU, and
+- `cpu(cureg)` to download a register `cureg` from GPU to CPU.
+
+```julia
+julia> using Yao, CUDA
+
+# create a register on GPU
+julia> cureg = rand_state(9; nbatch=1000) |> cu;   # or `curand_state(9; nbatch=1000)`.
+
+# run a circuit on GPU
+julia> cureg |> put(9, 2=>Z);
+
+# measure the register on GPU
+julia> measure!(cureg)
+1000-element CuArray{DitStr{2, 9, Int64}, 1, CUDA.Mem.DeviceBuffer}:
+ 110110100 ₍₂₎
+ 000100001 ₍₂₎
+ 111111001 ₍₂₎
+             ⋮
+ 010001101 ₍₂₎
+ 000100110 ₍₂₎
+
+# download the register to CPU
+julia> reg = cureg |> cpu;
+```
+
+
+## Features
+Supported gates:
+
+- general U(N) gate
+- general U(1) gate
+- X, Y, Z gate
+- T, S gate
+- SWAP gate
+- control gates
+
+Supported register operations:
+
+- measure!, measure_reset!, measure_remove!, select
+- append_qudits!, append_qubits!
+- insert_qudit!, insert_qubits!
+- focus!, relax!
+- join
+- density_matrix
+- fidelity
+- expect
+
+Autodiff:
+- autodiff is supported when the only parameterized gates are rotation gates in a circuit.
+
+## API
+```@docs
+cpu
+curand_state
+cuzero_state
+cuproduct_state
+cuuniform_state
+cughz_state
+```
+
+!!! note
+    the `cu` function is not documented in this module, but it is used to upload a register to GPU.
\ No newline at end of file
diff --git a/docs/src/man/plot.md b/docs/src/man/plot.md
index 7c0a3d6e..7e1e9160 100644
--- a/docs/src/man/plot.md
+++ b/docs/src/man/plot.md
@@ -11,7 +11,59 @@ end
 
 # Quantum Circuit Visualization
 
-Quantum circuit visualization support for Yao.
+`YaoPlots` is the Quantum circuit visualization component for Yao.
+
+
+## Examples
+#### Example 1: Visualize a QBIR define in Yao
+
+```@example plot
+using Yao.EasyBuild, YaoPlots
+
+# show a qft circuit
+vizcircuit(qft_circuit(5))
+```
+
+If you are using a Pluto/Jupyter notebook, Atom/VSCode editor, you should see the above image in your plotting panel.
+
+#### Example 2: Visualize a single qubit
+```@example plot
+using YaoPlots, Yao
+
+reg = zero_state(1) |> Rx(π/8) |> Rx(π/8)
+rho = density_matrix(ghz_state(2), 1)
+
+bloch_sphere("|ψ⟩"=>reg, "ρ"=>rho; show_projection_lines=true)
+```
+
+See more [examples](lib/YaoPlots/examples/circuits.jl).
+
+### Adjusting the plot attributes
+
+Various attributes of the visualizations can be altered. 
+The plot can be modified, if we change the following attributes
+
+- `YaoPlots.CircuitStyles.linecolor[]` for line color, default value being `"#000000"` (black color)
+- `YaoPlots.CircuitStyles.gate_bgcolor[]` for background color of square blocks, the default value being `"#FFFFFF"` (white color)
+- `YaoPlots.CircuitStyles.textcolor[]` for text color, default value being `"#000000`
+- `YaoPlots.CircuitStyles.lw[]` for line width, default value being `1` (pt)
+- `YaoPlots.CircuitStyles.textsize[]` for text size, default value being `16` (pt)
+- `YaoPlots.CircuitStyles.paramtextsize[]` for parameter text size, for parameterized gates, default value being `10` (pt)
+
+For example,
+
+```@example plot
+using YaoPlots, Yao
+YaoPlots.CircuitStyles.linecolor[] = "pink" 
+YaoPlots.CircuitStyles.gate_bgcolor[] = "yellow" 
+YaoPlots.CircuitStyles.textcolor[] = "#000080" # the navy blue color
+YaoPlots.CircuitStyles.fontfamily[] = "JuliaMono"
+YaoPlots.CircuitStyles.lw[] = 2.5
+YaoPlots.CircuitStyles.textsize[] = 13
+YaoPlots.CircuitStyles.paramtextsize[] = 8
+		
+vizcircuit(chain(3, put(1=>X), repeat(3, H), put(2=>Y), repeat(3, Rx(π/2))))
+```
 
 ## Circuit Visualization
 ```@docs
diff --git a/ext/CuYao/README.md b/ext/CuYao/README.md
index 4184db5c..becb9632 100644
--- a/ext/CuYao/README.md
+++ b/ext/CuYao/README.md
@@ -1,86 +1,8 @@
-**Build status**: [![][gitlab-img]][gitlab-url]
-
-[gitlab-img]: https://gitlab.com/JuliaGPU/CuYao.jl/badges/master/pipeline.svg
-[gitlab-url]: https://gitlab.com/JuliaGPU/CuYao.jl/pipelines
-
-CUDA support for [Yao.jl](https://github.com/QuantumBFS/Yao.jl).
-
-**Only tested locally, expect some adventures and rough edges.**
-
-## Installation
-
-<p>
-CuYao is a &nbsp;
-    <a href="https://julialang.org">
-        <img src="https://julialang.org/favicon.ico" width="16em">
-        Julia Language
-    </a>
-    &nbsp; package. It provides CUDA support for <a href="https://github.com/QuantumBFS/Yao.jl">Yao.jl</a>. To install CuYao,
-    please <a href="https://docs.julialang.org/en/v1/manual/getting-started/">open
-    Julia's interactive session (known as REPL)</a> and press <kbd>]</kbd> key in the REPL to use the package mode, then type the following command
-</p>
-
-For stable release
-
-```julia
-pkg> add CuYao
+## How to run tests
+    
+Open a terminal, switch to Yao folder and run the following commands:
+```bash
+make init-CuYao
+make test-CuYao
 ```
 
-For current master
-
-```julia
-pkg> add CuYao#master
-```
-
-You don't need to install Yao if you have `CuYao` installed. They share the same API except CUDA backend.
-
-## Documentation
-
-For documentation of [Yao](https://github.com/QuantumBFS/Yao.jl), please refer to [documentation (stable)](https://quantumbfs.github.io/Yao.jl/stable).
-
-`CuYao.jl` provides only two extra APIs, `reg |> cu` to upload a register to GPU, and `cureg |> cpu` to download a register to CPU.
-
-To start, see the following example
-```julia
-using CuYao
-
-cureg = rand_state(9; nbatch=1000) |> cu   # or `curand_state(9; nbatch=1000)`.
-cureg |> put(9, 2=>Z)
-measure!(cureg |> append_qubits!(1) |> focus!(4,1,3))
-cureg |> relax!(4,1,3) |> cpu
-```
-
-Constructors `curand_state`, `cuzero_state`, `cuproduct_state`, `cuuniform_state` and `cughz_state` are tailored for GPU,
-they are faster than uploading a CPU register to CPU.
-
-## Features
-### Supported Gates
-
-- general U(N) gate
-- general U(1) gate
-- better X, Y, Z gate
-- better T, S gate
-- SWAP gate
-- better control gates
-- BP diff blocks
-
-### Supported Register Operations
-- measure!, measure_reset!, measure_remove!, select
-- append_qudits!, append_qubits!
-- insert_qudit!, insert_qubits!
-- focus!, relax!
-- join
-- density_matrix
-- fidelity (not including density matrix)
-- expect
-
-### Other Operations
-- autodiff is supported when the only parameterized gates are rotation gates in a circuit.
-
-## The Team
-
-This project is an effort of QuantumBFS, an open source organization for quantum science. Yao is currently maintained by [Xiu-Zhe (Roger) Luo](https://github.com/Roger-luo) and [Jin-Guo Liu](https://github.com/GiggleLiu) with contributions from open source community. All the contributors are listed in the [contributors](https://github.com/QuantumBFS/Yao.jl/graphs/contributors).
-
-## License
-
-**CuYao** is released under the Apache 2 license.
diff --git a/ext/CuYao/src/register.jl b/ext/CuYao/src/register.jl
index cc493e94..01c83cbc 100644
--- a/ext/CuYao/src/register.jl
+++ b/ext/CuYao/src/register.jl
@@ -189,11 +189,6 @@ function Yao.insert_qudits!(reg::AbstractCuArrayReg{D}, loc::Int; nqudits::Int=1
     return reg
 end
 
-"""
-    cuproduct_state([T=ComplexF64], total::Int, bit_config::Integer; nbatch=NoBatch())
-
-The GPU version of [`product_state`](@ref).
-"""
 cuproduct_state(bit_str::BitStr; nbatch::Union{NoBatch,Int} = NoBatch()) =
     cuproduct_state(ComplexF64, bit_str; nbatch = nbatch)
 cuproduct_state(bit_str::AbstractVector; nbatch::Union{NoBatch,Int} = NoBatch()) =
@@ -219,11 +214,6 @@ end
 cuzero_state(n::Int; kwargs...) = cuzero_state(ComplexF64, n; kwargs...)
 cuzero_state(::Type{T}, n::Int; kwargs...) where {T} = cuproduct_state(T, n, 0; kwargs...)
 
-"""
-    curand_state([T=ComplexF64], n::Int; nbatch=1)
-
-The GPU version of [`rand_state`](@ref).
-"""
 curand_state(n::Int; kwargs...) = curand_state(ComplexF64, n; kwargs...)
 
 function curand_state(
@@ -236,11 +226,6 @@ function curand_state(
     return normalize!(arrayreg(raw; nbatch=nbatch, nlevel=nlevel))
 end
 
-"""
-    cuuniform_state([T=ComplexF64], n::Int; nbatch=1)
-
-The GPU version of [`uniform_state`](@ref).
-"""
 cuuniform_state(n::Int; kwargs...) = cuuniform_state(ComplexF64, n; kwargs...)
 function cuuniform_state(::Type{T}, n::Int;
     nbatch::Union{Int,NoBatch} = NoBatch(),
@@ -250,11 +235,6 @@ function cuuniform_state(::Type{T}, n::Int;
     return normalize!(arrayreg(raw; nbatch=nbatch, nlevel=nlevel))
 end
 
-"""
-    cughz_state([T=ComplexF64], n::Int; nbatch=1)
-
-The GPU version of [`ghz_state`](@ref).
-"""
 cughz_state(n::Int; kwargs...) = cughz_state(ComplexF64, n; kwargs...)
 function cughz_state(::Type{T}, n::Int; kwargs...) where {T}
     reg = cuzero_state(T, n; kwargs...)
diff --git a/src/Yao.jl b/src/Yao.jl
index 7b1deac1..87d1ac4b 100644
--- a/src/Yao.jl
+++ b/src/Yao.jl
@@ -29,12 +29,10 @@ using YaoBlocks:
 
 export EasyBuild
 # CUDA APIs
-for FT in [:cpu, :cuzero_state, :cuuniform_state, :curand_state, :cuproduct_state, :cughz_state]
-    @eval export $FT
-    @eval function $FT end
-end
+export cpu, cuzero_state, cuuniform_state, curand_state, cuproduct_state, cughz_state
 
-include("deprecations.jl")
+include("cudainterfaces.jl")
 include("EasyBuild/easybuild.jl")
+include("deprecations.jl")
 
 end # module
diff --git a/src/cudainterfaces.jl b/src/cudainterfaces.jl
new file mode 100644
index 00000000..335ba000
--- /dev/null
+++ b/src/cudainterfaces.jl
@@ -0,0 +1,41 @@
+"""
+    cuproduct_state([T=ComplexF64], total::Int, bit_config::Integer; nbatch=NoBatch())
+
+The GPU version of [`product_state`](@ref).
+"""
+function cuproduct_state end
+
+"""
+    curand_state([T=ComplexF64], n::Int; nbatch=1)
+
+The GPU version of [`rand_state`](@ref).
+"""
+function curand_state end
+
+"""
+    cuzero_state([T=ComplexF64], n::Int; nbatch=1)
+
+The GPU version of [`zero_state`](@ref).
+"""
+function cuzero_state end
+
+"""
+    cuuniform_state([T=ComplexF64], n::Int; nbatch=1)
+
+The GPU version of [`uniform_state`](@ref).
+"""
+function cuuniform_state end
+
+"""
+    cughz_state([T=ComplexF64], n::Int; nbatch=1)
+
+The GPU version of [`ghz_state`](@ref).
+"""
+function cughz_state end
+
+"""
+    cpu(cureg)
+
+Download the register state from GPU to CPU.
+"""
+function cpu end
\ No newline at end of file