Caching/Reusing Constant Intermediate Tensors¶
The following code example illustrates how to activate caching of constant intermediate tensors in order to greatly speed up repeated executions of a tensor network contraction in which only some of the input tensors change their values in each iteration. The full code can be found in the NVIDIA/cuQuantum repository (here).
Headers and Data Types¶
#include <stdlib.h>
#include <stdio.h>
#include <string.h>   // for memset
#include <cmath>      // for std::abs

#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>


#define HANDLE_ERROR(x) \
{ const auto err = x; \
  if( err != CUTENSORNET_STATUS_SUCCESS ) \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout); \
  } \
};

#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
  if( err != cudaSuccess ) \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout); \
  } \
};


struct GPUTimer
{
   GPUTimer(cudaStream_t stream): stream_(stream)
   {
      HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
      HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
   }

   ~GPUTimer()
   {
      HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
      HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
   }

   void start()
   {
      HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_));
   }

   float seconds()
   {
      HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
      HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
      float time;
      HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
      return time * 1e-3;
   }

   private:
   cudaEvent_t start_, stop_;
   cudaStream_t stream_;
};


int main()
{
   static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

   bool verbose = true;

   // Check cuTensorNet version
   const size_t cuTensornetVersion = cutensornetGetVersion();
   if(verbose)
      printf("cuTensorNet version: %ld\n", cuTensornetVersion);

   // Set GPU device
   int numDevices {0};
   HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
   const int deviceId = 0;
   HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
   cudaDeviceProp prop;
   HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

   if(verbose) {
      printf("===== device info ======\n");
      printf("GPU-local-id:%d\n", deviceId);
      printf("GPU-name:%s\n", prop.name);
      printf("GPU-clock:%d\n", prop.clockRate);
      printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
      printf("GPU-nSM:%d\n", prop.multiProcessorCount);
      printf("GPU-major:%d\n", prop.major);
      printf("GPU-minor:%d\n", prop.minor);
      printf("========================\n");
   }

   typedef float floatType;
   cudaDataType_t typeData = CUDA_R_32F;
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

   if(verbose)
      printf("Included headers and defined data types\n");
Define Tensor Network and Tensor Sizes¶
Next, we define the structure of the tensor network (i.e., the tensor modes, their extents, and their connectivity).
   /**********************
   * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,f,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
   * We will execute the contraction a few times, assuming all input tensors are constant except F.
   **********************/

   constexpr int32_t numInputs = 6;

   // Create vectors of tensor modes
   std::vector<std::vector<int32_t>> modesVec {
      {'a','b','c','d'},
      {'b','c','d','e'},
      {'e','f','g','h'},
      {'g','h','i','j'},
      {'i','j','k','l'},
      {'k','l','m'},
      {'a','m'}
   };

   // Set mode extents
   int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
   std::unordered_map<int32_t, int64_t> extent;
   for (auto &vec: modesVec)
   {
      for (auto &mode: vec)
      {
         extent[mode] = sameExtent;
      }
   }

   // Create a vector of extents for each tensor
   std::vector<std::vector<int64_t>> extentVec;
   extentVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      for (auto mode : modesVec[i])
         extentVec[i].push_back(extent[mode]);
   }

   if(verbose)
      printf("Defined tensor network, modes, and extents\n");
Allocate Memory, Initialize Data, Initialize the cuTensorNet Handle¶
Next, we allocate memory for the tensor network operands and initialize them to random values. Then we initialize the cuTensorNet library via cutensornetCreate().
   /**********************
   * Allocating data
   **********************/

   std::vector<size_t> elementsVec;
   elementsVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      elementsVec[i] = 1;
      for (auto mode : modesVec[i])
         elementsVec[i] *= extent[mode];
   }

   size_t totalSize = 0;
   std::vector<size_t> sizeVec;
   sizeVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      sizeVec[i] = sizeof(floatType) * elementsVec[i];
      totalSize += sizeVec[i];
   }
   if(verbose)
      printf("Total GPU memory used for tensor storage: %.2f GiB\n",
             (totalSize) / 1024. / 1024. / 1024);

   void* rawDataIn_d[numInputs];
   void* O_d;
   for (int i = 0; i < numInputs; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
   }
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]) );

   floatType* rawDataIn_h[numInputs];
   for (int i = 0; i < numInputs; ++i)
   {
      rawDataIn_h[i] = (floatType*) malloc(sizeof(floatType) * elementsVec[i]);
      if (rawDataIn_h[i] == NULL)
      {
         printf("Error: Host memory allocation failed!\n");
         return -1;
      }
   }
   floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
   if (O_h == NULL)
   {
      printf("Error: Host memory allocation failed!\n");
      return -1;
   }

   /*******************
   * Initialize data
   *******************/

   memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
   for (int i = 0; i < numInputs; ++i)
   {
      for (size_t e = 0; e < elementsVec[i]; ++e)
         rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
   }

   for (int i = 0; i < numInputs; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
   }

   if(verbose)
      printf("Allocated GPU memory for data, initialized data, and created library handle\n");

   /*************************
   * cuTensorNet
   *************************/

   cudaStream_t stream;
   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

   cutensornetHandle_t handle;
   HANDLE_ERROR( cutensornetCreate(&handle) );
Mark Constant Tensors and Create the Network Descriptor¶
Next, we specify which input tensors are constant, and create the network descriptor with the desired tensor modes, extents, strides, and qualifiers (e.g., constant), as well as the data and compute types. Note that the created library context will be associated with the currently active GPU.
   /*******************************
   * Set constant input tensors
   *******************************/

   // specify which input tensors are constant
   std::vector<cutensornetTensorQualifiers_t> qualifiersIn;
   qualifiersIn.resize(numInputs);
   for (int i = 0; i < numInputs; ++i)
   {
      if (i < 5)
         qualifiersIn[i].isConstant = 1;
      else
         qualifiersIn[i].isConstant = 0;
   }

   /*******************************
   * Create Network Descriptor
   *******************************/

   int32_t* modesIn[numInputs];
   int32_t numModesIn[numInputs];
   int64_t* extentsIn[numInputs];
   int64_t* stridesIn[numInputs];

   for (int i = 0; i < numInputs; ++i)
   {
      modesIn[i] = modesVec[i].data();
      numModesIn[i] = modesVec[i].size();
      extentsIn[i] = extentVec[i].data();
      stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
   }

   // Set up tensor network
   cutensornetNetworkDescriptor_t descNet;
   HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
                     numInputs, numModesIn, extentsIn, stridesIn, modesIn, qualifiersIn.data(),
                     modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
                     typeData, typeCompute,
                     &descNet) );

   if(verbose)
      printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");
Contraction Order and Slicing¶
In this example, we illustrate how to use a predetermined contraction path and set it into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute(). A sketch of the alternative, letting the library's path-finder choose the path, follows the code below.
   /*******************************
   * Choose workspace limit based on available resources.
   *******************************/

   size_t freeMem, totalMem;
   HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
   uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
   if(verbose)
      printf("Workspace limit = %lu\n", workspaceLimit);

   /*******************************
   * Set contraction order
   *******************************/

   // Create contraction optimizer info
   cutensornetContractionOptimizerInfo_t optimizerInfo;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );

   // set a predetermined contraction path
   std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
   const auto numContractions = numInputs - 1;
   cutensornetContractionPath_t contPath;
   contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
   contPath.numContractions = numContractions;

   // provide user-specified contPath
   HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                     &contPath,
                     sizeof(contPath)) );
   int64_t numSlices = 1;

   if(verbose)
      printf("Set predetermined contraction path into cuTensorNet optimizer\n");
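If you prefer not to hard-code a contraction path, the path can instead be found by the cuTensorNet path-finder. The following is a minimal sketch, assuming the optimizer-config API (cutensornetCreateContractionOptimizerConfig(), cutensornetContractionOptimize()) used by the other cuTensorNet samples; it would replace the manual CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH assignment above.

   // Alternative sketch: let cuTensorNet determine the contraction path (and slicing)
   cutensornetContractionOptimizerConfig_t optimizerConfig;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig) );
   // Find an optimized contraction path within the workspace limit
   HANDLE_ERROR( cutensornetContractionOptimize(handle,
                     descNet,
                     optimizerConfig,
                     workspaceLimit,
                     optimizerInfo) );
   // Query how many slices the optimizer decided on
   HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                     &numSlices,
                     sizeof(numSlices)) );
   HANDLE_ERROR( cutensornetDestroyContractionOptimizerConfig(optimizerConfig) );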
Create Workspace Descriptor and Allocate Workspace Memory¶
Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace sizes needed to contract the network. To activate intermediate tensor reuse, we need to provide a CACHE workspace that persists across multiple network contractions. Thus, we query the sizes and allocate device memory for both kinds of workspaces (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set them in the workspace descriptor. The workspace descriptor is then provided to the contraction plan creation and contraction APIs.
   /*******************************
   * Create workspace descriptor, allocate workspace, and set it.
   *******************************/

   cutensornetWorkspaceDescriptor_t workDesc;
   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );

   // set SCRATCH workspace, which will be used during each network contraction operation, not needed afterwards
   int64_t requiredWorkspaceSizeScratch = 0;
   HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
                     descNet,
                     optimizerInfo,
                     workDesc) );

   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                     workDesc,
                     CUTENSORNET_WORKSIZE_PREF_MIN,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_SCRATCH,
                     &requiredWorkspaceSizeScratch) );

   void* workScratch = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                     workDesc,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_SCRATCH,
                     workScratch,
                     requiredWorkspaceSizeScratch) );

   // set CACHE workspace, which will be used across network contraction operations
   int64_t requiredWorkspaceSizeCache = 0;
   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                     workDesc,
                     CUTENSORNET_WORKSIZE_PREF_MIN,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_CACHE,
                     &requiredWorkspaceSizeCache) );

   void* workCache = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                     workDesc,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_CACHE,
                     workCache,
                     requiredWorkspaceSizeCache) );

   if(verbose)
      printf("Allocated and set up the GPU workspace\n");
Note that the steps of creating a workspace descriptor and explicitly handling workspace memory can be skipped by setting a device memory handler instead, in which case cuTensorNet handles workspace memory implicitly by allocating/deallocating memory from the provided memory pool. See Memory Management APIs for details.
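As a rough illustration (not part of this sample), such a handler could be registered once after cutensornetCreate(). The sketch below assumes the cutensornetDeviceMemHandler_t struct and cutensornetSetDeviceMemHandler() function described in the Memory Management APIs, and routes allocations through the CUDA stream-ordered memory pool.

   // Sketch only: register a device memory handler so that cuTensorNet draws
   // workspace memory from a stream-ordered pool instead of explicit workDesc buffers.
   // Assumes the cutensornetDeviceMemHandler_t / cutensornetSetDeviceMemHandler() interface.
   auto poolAlloc = [](void* ctx, void** ptr, size_t size, cudaStream_t stream) -> int {
      return static_cast<int>(cudaMallocAsync(ptr, size, stream));   // 0 (cudaSuccess) on success
   };
   auto poolFree = [](void* ctx, void* ptr, size_t size, cudaStream_t stream) -> int {
      return static_cast<int>(cudaFreeAsync(ptr, stream));
   };
   cutensornetDeviceMemHandler_t memHandler{};
   memHandler.ctx          = nullptr;      // no user context needed here
   memHandler.device_alloc = poolAlloc;    // must return 0 on success
   memHandler.device_free  = poolFree;
   snprintf(memHandler.name, sizeof(memHandler.name), "cudaMallocAsyncPool");
   HANDLE_ERROR( cutensornetSetDeviceMemHandler(handle, &memHandler) );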
Contraction Plan and Auto-Tuning¶
We create a tensor network contraction plan holding all the pairwise tensor contraction plans for cuTENSOR. Optionally, we can auto-tune the plan so that cuTENSOR selects the best kernel for each pairwise contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of redundantly initializing the plan.
   /*******************************
   * Initialize the pairwise contraction plan (for cuTENSOR).
   *******************************/

   cutensornetContractionPlan_t plan;
   HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
                     descNet,
                     optimizerInfo,
                     workDesc,
                     &plan) );

   /*******************************
   * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
   *           for each pairwise tensor contraction.
   *******************************/
   cutensornetContractionAutotunePreference_t autotunePref;
   HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
                     &autotunePref) );

   const int numAutotuningIterations = 5; // may be 0
   HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
                     handle,
                     autotunePref,
                     CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
                     &numAutotuningIterations,
                     sizeof(numAutotuningIterations)) );

   // Modify the plan again to find the best pair-wise contractions
   HANDLE_ERROR( cutensornetContractionAutotune(handle,
                     plan,
                     rawDataIn_d,
                     O_d,
                     workDesc,
                     autotunePref,
                     stream) );

   HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );

   if(verbose)
      printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");
Tensor Network Contraction Execution¶
Finally, we contract the tensor network as many times as needed, possibly with different input each time. Note that the first network contraction call utilizes the provided CACHE workspace to store the constant intermediate tensors. Subsequent network contractions use the cached data to greatly reduce the computation time.
   /**********************
   * Execute the tensor network contraction
   **********************/

   // Create a cutensornetSliceGroup_t object from a range of slice IDs
   cutensornetSliceGroup_t sliceGroup{};
   HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );

   GPUTimer timer {stream};
   double minTimeCUTENSORNET = 1e100;
   double firstTimeCUTENSORNET = 1e100;
   const int numRuns = 3; // number of repeats to get stable performance results
   for (int i = 0; i < numRuns; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
      HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

      /*
       * Contract all slices of the tensor network
       */
      timer.start();

      int32_t accumulateOutput = 0; // output tensor data will be overwritten
      HANDLE_ERROR( cutensornetContractSlices(handle,
                        plan,
                        rawDataIn_d,
                        O_d,
                        accumulateOutput,
                        workDesc,
                        sliceGroup, // alternatively, NULL can be used to contract over all slices instead of specifying a sliceGroup object
                        stream) );

      // Synchronize and measure best timing
      auto time = timer.seconds();
      if (i == 0)
         firstTimeCUTENSORNET = time;
      minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
   }

   if(verbose)
      printf("Contracted the tensor network, each slice used the same contraction plan\n");

   // Print the 1-norm of the output tensor (verification)
   HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
   HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // restore the output tensor on Host
   double norm1 = 0.0;
   for (int64_t i = 0; i < elementsVec[numInputs]; ++i) {
      norm1 += std::abs(O_h[i]);
   }
   if(verbose)
      printf("Computed the 1-norm of the output tensor: %e\n", norm1);

   /*************************/

   // Query the total Flop count for the tensor network contraction
   double flops {0.0};
   HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                     &flops,
                     sizeof(flops)) );

   if(verbose) {
      printf("Number of tensor network slices = %ld\n", numSlices);
      printf("Network contraction flop cost = %e\n", flops);
      printf("Tensor network contraction time (ms):\n");
      printf("\tfirst run (intermediate tensors get cached) = %.3f\n", firstTimeCUTENSORNET * 1000.f);
      printf("\tsubsequent run (cache reused) = %.3f\n", minTimeCUTENSORNET * 1000.f);
   }
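To make the reuse scenario concrete, a hypothetical follow-up step (not part of the sample) could refresh only the non-constant tensor F (input index 5) and contract again. The constant intermediate tensors stored in the CACHE workspace would be reused, so only the contractions that depend on F are recomputed.

   // Hypothetical sketch: update only F (the sole non-constant input) and re-contract
   for (size_t e = 0; e < elementsVec[5]; ++e)
      rawDataIn_h[5][e] = ((floatType) rand()) / RAND_MAX;   // new values for F
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[5], rawDataIn_h[5], sizeVec[5], cudaMemcpyHostToDevice) );

   int32_t accumulateOutput = 0;   // overwrite the output tensor
   HANDLE_ERROR( cutensornetContractSlices(handle, plan, rawDataIn_d, O_d,
                     accumulateOutput, workDesc, sliceGroup, stream) );
   HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );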
Free Resources¶
After the computation, we need to free up all the resources.
   /***************
   * Free resources
   ****************/

   // Free cuTensorNet resources
   HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
   HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
   HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
   HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
   HANDLE_ERROR( cutensornetDestroy(handle) );

   // Free Host memory resources
   if (O_h) free(O_h);
   for (int i = 0; i < numInputs; ++i)
   {
      if (rawDataIn_h[i])
         free(rawDataIn_h[i]);
   }

   // Free GPU memory resources
   if (workScratch) cudaFree(workScratch);
   if (workCache) cudaFree(workCache);
   if (O_d) cudaFree(O_d);
   for (int i = 0; i < numInputs; ++i)
   {
      if (rawDataIn_d[i])
         cudaFree(rawDataIn_d[i]);
   }

   if(verbose)
      printf("Freed resources and exited\n");

   return 0;
}