计算矩阵乘积态边际分布张量

以下代码示例说明了如何定义张量网络状态,计算其矩阵乘积态 (MPS) 分解,然后计算 MPS 分解状态的边际分布张量。完整的代码可以在 NVIDIA/cuQuantum 存储库中找到(此处)。

头文件和错误处理

 7#include <cstdlib>
 8#include <cstdio>
 9#include <cassert>
10#include <complex>
11#include <vector>
12#include <bitset>
13#include <iostream>
14
15#include <cuda_runtime.h>
16#include <cutensornet.h>
17
18
19#define HANDLE_CUDA_ERROR(x) \
20{ const auto err = x; \
21  if( err != cudaSuccess ) \
22  { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
23};
24
25#define HANDLE_CUTN_ERROR(x) \
26{ const auto err = x; \
27  if( err != CUTENSORNET_STATUS_SUCCESS ) \
28  { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
29};
30
31
32int main()
33{
34  static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
35
36  constexpr std::size_t fp64size = sizeof(double);

定义张量网络状态和所需的边际分布张量

让我们定义一个对应于 16 量子比特量子线路的张量网络状态,并请求量子比特 0 和 1 的边际分布张量。

40  // Quantum state configuration
41  constexpr int32_t numQubits = 16;
42  const std::vector<int64_t> qubitDims(numQubits,2); // qubit dimensions
43  constexpr int32_t numMarginalModes = 2; // rank of the marginal (reduced density matrix)
44  const std::vector<int32_t> marginalModes({0,1}); // open qubits (must be in acsending order)
45  std::cout << "Quantum circuit: " << numQubits << " qubits\n";

初始化 cuTensorNet 库句柄

49  // Initialize the cuTensorNet library
50  HANDLE_CUDA_ERROR(cudaSetDevice(0));
51  cutensornetHandle_t cutnHandle;
52  HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
53  std::cout << "Initialized cuTensorNet library on GPU 0\n";

在 GPU 上定义量子门

57  // Define necessary quantum gate tensors in Host memory
58  const double invsq2 = 1.0 / std::sqrt(2.0);
59  //  Hadamard gate
60  const std::vector<std::complex<double>> h_gateH {{invsq2, 0.0},  {invsq2, 0.0},
61                                                   {invsq2, 0.0}, {-invsq2, 0.0}};
62  //  CX gate
63  const std::vector<std::complex<double>> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
64                                                    {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
65                                                    {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0},
66                                                    {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
67
68  // Copy quantum gates to Device memory
69  void *d_gateH{nullptr}, *d_gateCX{nullptr};
70  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
71  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size)));
72  std::cout << "Allocated quantum gate memory on GPU\n";
73  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
74  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
75  std::cout << "Copied quantum gates to GPU memory\n";

分配 MPS 张量

在这里,我们设置 MPS 张量的形状,并为其存储分配 GPU 内存。

79  // Determine the MPS representation and allocate buffers for the MPS tensors
80  const int64_t maxExtent = 2; // GHZ state can be exactly represented with max bond dimension of 2
81  std::vector<std::vector<int64_t>> extents;
82  std::vector<int64_t*> extentsPtr(numQubits); 
83  std::vector<void*> d_mpsTensors(numQubits, nullptr);
84  for (int32_t i = 0; i < numQubits; i++) {
85    if (i == 0) { // left boundary MPS tensor
86      extents.push_back({2, maxExtent});
87      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size));
88    } 
89    else if (i == numQubits-1) { // right boundary MPS tensor
90      extents.push_back({maxExtent, 2});
91      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size));
92    }
93    else { // middle MPS tensors
94      extents.push_back({maxExtent, 2, maxExtent});
95      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * 2 * fp64size));
96    }
97    extentsPtr[i] = extents[i].data();
98  }

在 GPU 上分配边际分布张量

在这里,我们分配边际分布张量,即量子比特 0 和 1 的约化密度矩阵,在 GPU 上。

102  // Allocate the specified quantum circuit reduced density matrix (marginal) in Device memory
103  void *d_rdm{nullptr};
104  std::size_t rdmDim = 1;
105  for(const auto & mode: marginalModes) rdmDim *= qubitDims[mode];
106  const std::size_t rdmSize = rdmDim * rdmDim;
107  HANDLE_CUDA_ERROR(cudaMalloc(&d_rdm, rdmSize * (2 * fp64size)));

在 GPU 上分配暂存缓冲区

111  // Query the free memory on Device
112  std::size_t freeSize{0}, totalSize{0};
113  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
114  const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment
115  void *d_scratch{nullptr};
116  HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
117  std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n";

创建纯张量网络状态

现在,让我们为一个 16 量子比特量子线路创建一个纯张量网络状态。

121  // Create the initial quantum state
122  cutensornetState_t quantumState;
123  HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
124                    CUDA_C_64F, &quantumState));
125  std::cout << "Created the initial quantum state\n";

应用量子门

让我们通过应用相应的量子门来构建 GHZ 量子线路。

129  // Construct the final quantum circuit state (apply quantum gates) for the GHZ circuit
130  int64_t id;
131  HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 1, std::vector<int32_t>{{0}}.data(),
132                    d_gateH, nullptr, 1, 0, 1, &id));
133  for(int32_t i = 1; i < numQubits; ++i) {
134    HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 2, std::vector<int32_t>{{i-1,i}}.data(),
135                      d_gateCX, nullptr, 1, 0, 1, &id));
136  }
137  std::cout << "Applied quantum gates\n";

请求最终量子线路状态的 MPS 分解

在这里,我们表达了使用 MPS 分解来分解最终量子线路状态的意图。提供的 MPS 张量的形状指的是 MPS 重整化过程中的最大尺寸限制。最终 MPS 张量的实际计算形状可能更小。此处尚未进行任何计算。

141  // Specify the final target MPS representation (use default fortran strides)
142  HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, quantumState, 
143                    CUTENSORNET_BOUNDARY_CONDITION_OPEN, extentsPtr.data(), /*strides=*/nullptr));
144  std::cout << "Requested the final MPS factorization of the quantum circuit state\n";

配置 MPS 分解过程

在表达了执行最终量子线路状态的 MPS 分解的意图后,我们还可以通过重置不同的选项(例如,SVD 算法)来配置 MPS 分解过程。

148  // Optional, set up the SVD method for MPS truncation.
149  cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; 
150  HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, quantumState, 
151                    CUTENSORNET_STATE_CONFIG_MPS_SVD_ALGO, &algo, sizeof(algo)));
152  std::cout << "Configured the MPS factorization computation\n";

准备 MPS 分解的计算

让我们创建一个工作区描述符并准备 MPS 分解的计算。

156  // Prepare the MPS computation and attach workspace
157  cutensornetWorkspaceDescriptor_t workDesc;
158  HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
159  std::cout << "Created the workspace descriptor\n";
160  HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, quantumState, scratchSize, workDesc, 0x0));
161  std::cout << "Prepared the computation of the quantum circuit state\n";
162  double flops {0.0};
163  HANDLE_CUTN_ERROR(cutensornetStateGetInfo(cutnHandle, quantumState,
164                    CUTENSORNET_STATE_INFO_FLOPS, &flops, sizeof(flops)));
165  if(flops > 0.0) {
166    std::cout << "Total flop count = " << (flops/1e9) << " GFlop\n";
167  }else if(flops < 0.0) {
168    std::cout << "ERROR: Negative Flop count!\n";
169    std::abort();
170  }
171
172  int64_t worksize {0};
173  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
174                                                      workDesc,
175                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
176                                                      CUTENSORNET_MEMSPACE_DEVICE,
177                                                      CUTENSORNET_WORKSPACE_SCRATCH,
178                                                      &worksize));
179  std::cout << "Scratch GPU workspace size (bytes) for MPS computation = " << worksize << std::endl;
180  if(worksize <= scratchSize) {
181    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
182                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
183  }else{
184    std::cout << "ERROR: Insufficient workspace size on Device!\n";
185    std::abort();
186  }
187  std::cout << "Set the workspace buffer for the MPS factorization computation\n";

计算 MPS 分解

一旦 MPS 分解过程被配置和准备好,让我们计算最终量子线路状态的 MPS 分解。

191  // Execute MPS computation
192  HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, quantumState, 
193                    workDesc, extentsPtr.data(), /*strides=*/nullptr, d_mpsTensors.data(), 0));
194  std::cout << "Computed the MPS factorization\n";

创建边际分布对象

一旦量子线路构建完成,让我们创建边际分布对象,该对象将计算量子比特 0 和 1 的边际分布张量(约化密度矩阵)。

198  // Specify the desired reduced density matrix (marginal)
199  cutensornetStateMarginal_t marginal;
200  HANDLE_CUTN_ERROR(cutensornetCreateMarginal(cutnHandle, quantumState, numMarginalModes, marginalModes.data(),
201                    0, nullptr, std::vector<int64_t>{{1,2,4,8}}.data(), &marginal)); // using explicit strides
202  std::cout << "Created the specified quantum circuit reduced densitry matrix (marginal)\n";

配置边际分布对象

现在我们可以通过设置张量网络收缩路径查找器要使用的超样本数量来配置边际分布对象。

206  // Configure the computation of the specified quantum circuit reduced density matrix (marginal)
207  const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder
208  HANDLE_CUTN_ERROR(cutensornetMarginalConfigure(cutnHandle, marginal,
209                    CUTENSORNET_MARGINAL_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples)));
210  std::cout << "Configured the specified quantum circuit reduced density matrix (marginal) computation\n";

准备边际分布张量的计算

让我们准备边际分布张量的计算。

214  // Prepare the specified quantum circuit reduced densitry matrix (marginal)
215  HANDLE_CUTN_ERROR(cutensornetMarginalPrepare(cutnHandle, marginal, scratchSize, workDesc, 0x0));
216  std::cout << "Prepared the specified quantum circuit reduced density matrix (marginal)\n";
217  flops = 0.0;
218  HANDLE_CUTN_ERROR(cutensornetMarginalGetInfo(cutnHandle, marginal,
219                    CUTENSORNET_MARGINAL_INFO_FLOPS, &flops, sizeof(flops)));
220  std::cout << "Total flop count = " << (flops/1e9) << " GFlop\n";
221  if(flops <= 0.0) {
222    std::cout << "ERROR: Invalid Flop count!\n";
223    std::abort();
224  }

设置工作区

现在我们可以设置所需的工作区缓冲区。

228  // Attach the workspace buffer
229  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
230                                                      workDesc,
231                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
232                                                      CUTENSORNET_MEMSPACE_DEVICE,
233                                                      CUTENSORNET_WORKSPACE_SCRATCH,
234                                                      &worksize));
235  std::cout << "Required scratch GPU workspace size (bytes) for marginal computation = " << worksize << std::endl;
236  if(worksize <= scratchSize) {
237    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
238                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
239  }else{
240    std::cout << "ERROR: Insufficient workspace size on Device!\n";
241    std::abort();
242  }
243  std::cout << "Set the workspace buffer\n";
244  

计算边际分布张量

一旦一切都设置好,我们计算请求的量子比特 0 和 1 的边际分布张量(约化密度矩阵),将其复制回主机内存,并打印出来。

247  // Compute the specified quantum circuit reduced densitry matrix (marginal)
248  HANDLE_CUTN_ERROR(cutensornetMarginalCompute(cutnHandle, marginal, nullptr, workDesc, d_rdm, 0));
249  std::cout << "Computed the specified quantum circuit reduced density matrix (marginal)\n";
250  std::vector<std::complex<double>> h_rdm(rdmSize);
251  HANDLE_CUDA_ERROR(cudaMemcpy(h_rdm.data(), d_rdm, rdmSize * (2 * fp64size), cudaMemcpyDeviceToHost));
252  std::cout << "Reduced density matrix for " << numMarginalModes << " qubits:\n";
253  for(std::size_t i = 0; i < rdmDim; ++i) {
254    for(std::size_t j = 0; j < rdmDim; ++j) {
255      std::cout << " " << h_rdm[i + j * rdmDim];
256    }
257    std::cout << std::endl;
258  }

释放资源

262  // Destroy the workspace descriptor
263  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
264  std::cout << "Destroyed the workspace descriptor\n";
265
266  // Destroy the quantum circuit reduced density matrix
267  HANDLE_CUTN_ERROR(cutensornetDestroyMarginal(marginal));
268  std::cout << "Destroyed the quantum circuit state reduced density matrix (marginal)\n";
269
270  // Destroy the quantum circuit state
271  HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));
272  std::cout << "Destroyed the quantum circuit state\n";
273
274  for (int32_t i = 0; i < numQubits; i++) {
275    HANDLE_CUDA_ERROR(cudaFree(d_mpsTensors[i]));
276  }
277  HANDLE_CUDA_ERROR(cudaFree(d_scratch));
278  HANDLE_CUDA_ERROR(cudaFree(d_rdm));
279  HANDLE_CUDA_ERROR(cudaFree(d_gateCX));
280  HANDLE_CUDA_ERROR(cudaFree(d_gateH));
281  std::cout << "Freed memory on GPU\n";
282
283  // Finalize the cuTensorNet library
284  HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
285  std::cout << "Finalized the cuTensorNet library\n";
286
287  return 0;
288}