以下代码示例说明了如何集成 cuTensorNet 功能以执行张量 QR 操作。完整代码可以在 NVIDIA/cuQuantum 存储库的 samples 目录中找到。

定义 QR 分解

我们首先定义要执行的 QR 分解,包括数据类型、模式分区和范围。

 97   /**********************************************
 98   * Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}  
 99   ***********************************************/
100
101   typedef float floatType;
102   cudaDataType_t typeData = CUDA_R_32F;
103
104   // Create vector of modes
105   int32_t sharedMode = 'x';
106
107   std::vector<int32_t> modesT{'i','j','m','n'}; // input
108   std::vector<int32_t> modesQ{'i', sharedMode,'m'};
109   std::vector<int32_t> modesR{'n', sharedMode,'j'};  // QR output
110
111   // Extents
112   std::unordered_map<int32_t, int64_t> extentMap;
113   extentMap['i'] = 16;
114   extentMap['j'] = 16;
115   extentMap['m'] = 16;
116   extentMap['n'] = 16;
117
118   int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
119   int64_t colExtent = computeCombinedExtent(extentMap, modesR);
120   
121   // cuTensorNet tensor QR operates in reduced mode expecting k = min(m, n)
122   extentMap[sharedMode] = rowExtent <= colExtent? rowExtent: colExtent;
123
124   // Create a vector of extents for each tensor
125   std::vector<int64_t> extentT;
126   for (auto mode : modesT)
127      extentT.push_back(extentMap[mode]);
128   std::vector<int64_t> extentQ;
129   for (auto mode : modesQ)
130      extentQ.push_back(extentMap[mode]);
131   std::vector<int64_t> extentR;
132   for (auto mode : modesR)
133      extentR.push_back(extentMap[mode]);

注意

cutensornetTensorQR() 在简化模式下运行,期望输出的共享范围是组合行范围和组合列范围的最小值。

分配内存并初始化数据

接下来,我们为输入和输出张量操作数分配内存。输入操作数初始化为随机值。

136   /***********************************
137   * Allocating data on host and device
138   ************************************/
139
140   size_t elementsT = 1;
141   for (auto mode : modesT)
142      elementsT *= extentMap[mode];
143   size_t elementsQ = 1;
144   for (auto mode : modesQ)
145      elementsQ *= extentMap[mode];
146   size_t elementsR = 1;
147   for (auto mode : modesR)
148      elementsR *= extentMap[mode];
149
150   size_t sizeT = sizeof(floatType) * elementsT;
151   size_t sizeQ = sizeof(floatType) * elementsQ;
152   size_t sizeR = sizeof(floatType) * elementsR;
153
154   printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR)/1024./1024./1024);
155
156   floatType *T = (floatType*) malloc(sizeT);
157   floatType *Q = (floatType*) malloc(sizeQ);
158   floatType *R = (floatType*) malloc(sizeR);
159
160   if (T == NULL || Q==NULL || R==NULL )
161   {
162      printf("Error: Host allocation of input T or output Q/R.\n");
163      return -1;
164   }
165
166   void* D_T;
167   void* D_Q;
168   void* D_R;
169   
170   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
171   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
172   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );
173
174   /****************
175   * Initialize data
176   *****************/
177
178   for (uint64_t i = 0; i < elementsT; i++)
179      T[i] = ((floatType) rand())/RAND_MAX;
180
181   HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
182   printf("Allocate memory for data, and initialize data.\n");

初始化 cuTensorNet 并创建张量描述符

然后,我们初始化 cuTensorNet 的库句柄,并为输入和输出张量创建 cutensornetTensorDescriptor_t。

185   /******************
186   * cuTensorNet
187   *******************/
188
189   cudaStream_t stream;
190   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
191
192   cutensornetHandle_t handle;
193   HANDLE_ERROR( cutensornetCreate(&handle) );
194
195   /***************************
196   * Create tensor descriptors
197   ****************************/
198
199   cutensornetTensorDescriptor_t descTensorIn;
200   cutensornetTensorDescriptor_t descTensorQ;
201   cutensornetTensorDescriptor_t descTensorR;
202
203   const int32_t numModesIn = modesT.size();
204   const int32_t numModesQ = modesQ.size();
205   const int32_t numModesR = modesR.size();
206
207   const int64_t* strides = NULL; // assuming fortran layout for all tensors
208
209   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
210   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
211   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );
212    
213   printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");

查询和分配所需工作区

创建所有张量描述符后,我们可以使用 cutensornetWorkspaceComputeQRSizes() 查询所需的工作区大小。

216   /********************************************
217   * Query and allocate required workspace sizes
218   *********************************************/
219
220   cutensornetWorkspaceDescriptor_t workDesc;
221   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
222   HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
223   int64_t hostWorkspaceSize, deviceWorkspaceSize;
224
225   // for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
226   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
227                                                   workDesc,
228                                                   CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
229                                                   CUTENSORNET_MEMSPACE_DEVICE,
230                                                   CUTENSORNET_WORKSPACE_SCRATCH,
231                                                   &deviceWorkspaceSize) );
232   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
233                                                   workDesc,
234                                                   CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
235                                                   CUTENSORNET_MEMSPACE_HOST,
236                                                   CUTENSORNET_WORKSPACE_SCRATCH,
237                                                   &hostWorkspaceSize) );
238
239   void *devWork = nullptr, *hostWork = nullptr;
240   if (deviceWorkspaceSize > 0) {
241      HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
242   }
243   if (hostWorkspaceSize > 0) {
244      hostWork = malloc(hostWorkspaceSize);
245   }
246   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
247                                               workDesc,
248                                               CUTENSORNET_MEMSPACE_DEVICE,
249                                               CUTENSORNET_WORKSPACE_SCRATCH,
250                                               devWork,
251                                               deviceWorkspaceSize) );
252   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
253                                               workDesc,
254                                               CUTENSORNET_MEMSPACE_HOST,
255                                               CUTENSORNET_WORKSPACE_SCRATCH,
256                                               hostWork,
257                                               hostWorkspaceSize) );

执行

在此阶段,我们可以通过调用 cutensornetTensorQR() 执行 QR 分解。

260   /**********
261   * Execution
262   ***********/
263  
264   GPUTimer timer{stream};
265   double minTimeCUTENSOR = 1e100;
266   const int numRuns = 3; // to get stable perf results
267   for (int i=0; i < numRuns; ++i)
268   {  
269      // restore output
270      cudaMemsetAsync(D_Q, 0, sizeQ, stream);
271      cudaMemsetAsync(D_R, 0, sizeR, stream);
272      cudaDeviceSynchronize();
273
274      timer.start();
275      HANDLE_ERROR( cutensornetTensorQR(handle, 
276                        descTensorIn, D_T, 
277                        descTensorQ, D_Q, 
278                        descTensorR, D_R, 
279                        workDesc,
280                        stream) );
281      // Synchronize and measure timing
282      auto time = timer.seconds();
283      minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
284   }
285
286   printf("Performing QR\n");
287
288   HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
289   HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );
290
291   cudaDeviceSynchronize(); // device synchronization.
292   printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);

释放资源

计算完成后,我们需要释放所有资源。

295   /***************
296   * Free resources
297   ****************/
298   
299   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
300   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
301   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
302   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
303   HANDLE_ERROR( cutensornetDestroy(handle) );
304
305   if (T) free(T);
306   if (Q) free(Q);
307   if (R) free(R);
308   if (D_T) cudaFree(D_T);
309   if (D_Q) cudaFree(D_Q);
310   if (D_R) cudaFree(D_R);
311   if (devWork) cudaFree(devWork);
312   if (hostWork) free(hostWork);
313
314   printf("Free resource and exit.\n");
315
316   return 0;
317}