以下代码示例说明了如何集成 cuTensorNet 功能以执行张量 QR 操作。完整代码可以在 NVIDIA/cuQuantum 存储库的示例目录中找到。
定义 QR 分解¶
我们首先定义要执行的 QR 分解,包括数据类型、模式分区和范围。
97 /**********************************************
98 * Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}
99 ***********************************************/
100
101 typedef float floatType;
102 cudaDataType_t typeData = CUDA_R_32F;
103
104 // Create vector of modes
105 int32_t sharedMode = 'x';
106
107 std::vector<int32_t> modesT{'i','j','m','n'}; // input
108 std::vector<int32_t> modesQ{'i', sharedMode,'m'};
109 std::vector<int32_t> modesR{'n', sharedMode,'j'}; // QR output
110
111 // Extents
112 std::unordered_map<int32_t, int64_t> extentMap;
113 extentMap['i'] = 16;
114 extentMap['j'] = 16;
115 extentMap['m'] = 16;
116 extentMap['n'] = 16;
117
118 int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
119 int64_t colExtent = computeCombinedExtent(extentMap, modesR);
120
121 // cuTensorNet tensor QR operates in reduced mode expecting k = min(m, n)
122 extentMap[sharedMode] = rowExtent <= colExtent? rowExtent: colExtent;
123
124 // Create a vector of extents for each tensor
125 std::vector<int64_t> extentT;
126 for (auto mode : modesT)
127 extentT.push_back(extentMap[mode]);
128 std::vector<int64_t> extentQ;
129 for (auto mode : modesQ)
130 extentQ.push_back(extentMap[mode]);
131 std::vector<int64_t> extentR;
132 for (auto mode : modesR)
133 extentR.push_back(extentMap[mode]);
注意
cutensornetTensorQR()
在简化模式下运行,期望输出的共享范围是组合行范围和组合列范围的最小值。
分配内存并初始化数据¶
接下来,我们为输入和输出张量操作数分配内存。输入操作数初始化为随机值。
136 /***********************************
137 * Allocating data on host and device
138 ************************************/
139
140 size_t elementsT = 1;
141 for (auto mode : modesT)
142 elementsT *= extentMap[mode];
143 size_t elementsQ = 1;
144 for (auto mode : modesQ)
145 elementsQ *= extentMap[mode];
146 size_t elementsR = 1;
147 for (auto mode : modesR)
148 elementsR *= extentMap[mode];
149
150 size_t sizeT = sizeof(floatType) * elementsT;
151 size_t sizeQ = sizeof(floatType) * elementsQ;
152 size_t sizeR = sizeof(floatType) * elementsR;
153
154 printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR)/1024./1024./1024);
155
156 floatType *T = (floatType*) malloc(sizeT);
157 floatType *Q = (floatType*) malloc(sizeQ);
158 floatType *R = (floatType*) malloc(sizeR);
159
160 if (T == NULL || Q==NULL || R==NULL )
161 {
162 printf("Error: Host allocation of input T or output Q/R.\n");
163 return -1;
164 }
165
166 void* D_T;
167 void* D_Q;
168 void* D_R;
169
170 HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
171 HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
172 HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );
173
174 /****************
175 * Initialize data
176 *****************/
177
178 for (uint64_t i = 0; i < elementsT; i++)
179 T[i] = ((floatType) rand())/RAND_MAX;
180
181 HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
182 printf("Allocate memory for data, and initialize data.\n");
初始化 cuTensorNet 并创建张量描述符¶
然后,我们初始化 cuTensorNet 的库句柄,并为输入和输出张量创建 cutensornetTensorDescriptor_t。
185 /******************
186 * cuTensorNet
187 *******************/
188
189 cudaStream_t stream;
190 HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
191
192 cutensornetHandle_t handle;
193 HANDLE_ERROR( cutensornetCreate(&handle) );
194
195 /***************************
196 * Create tensor descriptors
197 ****************************/
198
199 cutensornetTensorDescriptor_t descTensorIn;
200 cutensornetTensorDescriptor_t descTensorQ;
201 cutensornetTensorDescriptor_t descTensorR;
202
203 const int32_t numModesIn = modesT.size();
204 const int32_t numModesQ = modesQ.size();
205 const int32_t numModesR = modesR.size();
206
207 const int64_t* strides = NULL; // assuming fortran layout for all tensors
208
209 HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
210 HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
211 HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );
212
213 printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");
查询和分配所需工作区¶
创建所有张量描述符后,我们可以使用 cutensornetWorkspaceComputeQRSizes()
查询所需的工作区大小。
216 /********************************************
217 * Query and allocate required workspace sizes
218 *********************************************/
219
220 cutensornetWorkspaceDescriptor_t workDesc;
221 HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
222 HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
223 int64_t hostWorkspaceSize, deviceWorkspaceSize;
224
225 // for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
226 HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
227 workDesc,
228 CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
229 CUTENSORNET_MEMSPACE_DEVICE,
230 CUTENSORNET_WORKSPACE_SCRATCH,
231 &deviceWorkspaceSize) );
232 HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
233 workDesc,
234 CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
235 CUTENSORNET_MEMSPACE_HOST,
236 CUTENSORNET_WORKSPACE_SCRATCH,
237 &hostWorkspaceSize) );
238
239 void *devWork = nullptr, *hostWork = nullptr;
240 if (deviceWorkspaceSize > 0) {
241 HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
242 }
243 if (hostWorkspaceSize > 0) {
244 hostWork = malloc(hostWorkspaceSize);
245 }
246 HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
247 workDesc,
248 CUTENSORNET_MEMSPACE_DEVICE,
249 CUTENSORNET_WORKSPACE_SCRATCH,
250 devWork,
251 deviceWorkspaceSize) );
252 HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
253 workDesc,
254 CUTENSORNET_MEMSPACE_HOST,
255 CUTENSORNET_WORKSPACE_SCRATCH,
256 hostWork,
257 hostWorkspaceSize) );
执行¶
在此阶段,我们可以通过调用 cutensornetTensorQR()
执行 QR 分解。
260 /**********
261 * Execution
262 ***********/
263
264 GPUTimer timer{stream};
265 double minTimeCUTENSOR = 1e100;
266 const int numRuns = 3; // to get stable perf results
267 for (int i=0; i < numRuns; ++i)
268 {
269 // restore output
270 cudaMemsetAsync(D_Q, 0, sizeQ, stream);
271 cudaMemsetAsync(D_R, 0, sizeR, stream);
272 cudaDeviceSynchronize();
273
274 timer.start();
275 HANDLE_ERROR( cutensornetTensorQR(handle,
276 descTensorIn, D_T,
277 descTensorQ, D_Q,
278 descTensorR, D_R,
279 workDesc,
280 stream) );
281 // Synchronize and measure timing
282 auto time = timer.seconds();
283 minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
284 }
285
286 printf("Performing QR\n");
287
288 HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
289 HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );
290
291 cudaDeviceSynchronize(); // device synchronization.
292 printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);
释放资源¶
计算完成后,我们需要释放所有资源。
295 /***************
296 * Free resources
297 ****************/
298
299 HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
300 HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
301 HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
302 HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
303 HANDLE_ERROR( cutensornetDestroy(handle) );
304
305 if (T) free(T);
306 if (Q) free(Q);
307 if (R) free(R);
308 if (D_T) cudaFree(D_T);
309 if (D_Q) cudaFree(D_Q);
310 if (D_R) cudaFree(D_R);
311 if (devWork) cudaFree(devWork);
312 if (hostWork) free(hostWork);
313
314 printf("Free resource and exit.\n");
315
316 return 0;
317}