Computing Gradients via Backpropagation¶
The following code example illustrates how to compute the gradients of a tensor network with respect to user-selected input tensors via backpropagation. The full code can be found in the NVIDIA/cuQuantum repository (here).
Headers and data types¶
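Conceptually, backpropagation through a tensor network is itself a set of tensor contractions: given a scalar loss L and a user-supplied output activation (adjoint) ∂L/∂O, the gradient with respect to any input tensor is the contraction of that activation with all the remaining input tensors. For the network contracted in this sample, the rule for tensor A reads (a notation sketch, not part of the code):

$$
\frac{\partial L}{\partial A_{a b c d}}
 = \sum_{e,g,h,i,j,k,l,m}
   \frac{\partial L}{\partial O_{a m}}\,
   B_{b c d e}\, C_{e g h}\, D_{g h i j}\, E_{i j k l}\, F_{k l m}
$$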
#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>


#define HANDLE_ERROR(x) \
{ const auto err = x; \
  if( err != CUTENSORNET_STATUS_SUCCESS ) \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout); \
  } \
};

#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
  if( err != cudaSuccess ) \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout); \
  } \
};


struct GPUTimer
{
   GPUTimer(cudaStream_t stream): stream_(stream)
   {
      HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
      HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
   }

   ~GPUTimer()
   {
      HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
      HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
   }

   void start()
   {
      HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_));
   }

   float seconds()
   {
      HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
      HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
      float time;
      HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
      return time * 1e-3;
   }

   private:
   cudaEvent_t start_, stop_;
   cudaStream_t stream_;
};


int main()
{
   static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

   bool verbose = true;

   // Check cuTensorNet version
   const size_t cuTensornetVersion = cutensornetGetVersion();
   if(verbose)
      printf("cuTensorNet version: %ld\n", cuTensornetVersion);

   // Set GPU device
   int numDevices {0};
   HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
   const int deviceId = 0;
   HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
   cudaDeviceProp prop;
   HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

   if(verbose) {
      printf("===== device info ======\n");
      printf("GPU-local-id:%d\n", deviceId);
      printf("GPU-name:%s\n", prop.name);
      printf("GPU-clock:%d\n", prop.clockRate);
      printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
      printf("GPU-nSM:%d\n", prop.multiProcessorCount);
      printf("GPU-major:%d\n", prop.major);
      printf("GPU-minor:%d\n", prop.minor);
      printf("========================\n");
   }

   typedef float floatType;
   cudaDataType_t typeData = CUDA_R_32F;
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

   if(verbose)
      printf("Included headers and defined data types\n");
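If double precision were preferred instead, only these three definitions would change. A minimal sketch (not part of the sample), using the corresponding CUDA and cuTensorNet type constants:

   typedef double floatType;                                       // 64-bit real elements
   cudaDataType_t typeData = CUDA_R_64F;                           // double-precision data
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_64F; // double-precision compute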
Define tensor network and tensor sizes¶
Next, we define the structure of the tensor network (i.e., the tensor modes, extents, and their connectivity) and specify the IDs of the input tensors whose gradients will be computed.
   /**********************
   * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
   * We will execute the contraction and compute the gradients of input tensors A, B, C
   **********************/

   constexpr int32_t numInputs = 6;
   std::vector<int32_t> gradInputIDs = {0, 1, 2};

   // Create vectors of tensor modes
   std::vector<std::vector<int32_t>> modesVec {
      {'a','b','c','d'},
      {'b','c','d','e'},
      {'e','g','h'},
      {'g','h','i','j'},
      {'i','j','k','l'},
      {'k','l','m'},
      {'a','m'}
   };

   // Set mode extents
   int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
   std::unordered_map<int32_t, int64_t> extent;
   for (auto &vec: modesVec)
   {
      for (auto &mode: vec)
      {
         extent[mode] = sameExtent;
      }
   }

   // Create a vector of extents for each tensor
   std::vector<std::vector<int64_t>> extentVec;
   extentVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      for (auto mode : modesVec[i])
         extentVec[i].push_back(extent[mode]);
   }

   if(verbose)
      printf("Defined tensor network, modes, and extents\n");
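As a quick sanity check of the sizes implied by these extents: tensor A_{a,b,c,d} has 36^4 = 1,679,616 elements, C_{e,g,h} has 36^3 = 46,656, and the output O_{a,m} has 36^2 = 1,296; in FP32 the six inputs plus the output together occupy (4 × 36^4 + 2 × 36^3 + 36^2) × 4 bytes ≈ 26 MiB, which matches the storage total printed by the next code section.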
Allocate memory, initialize data, initialize cuTensorNet handle¶
Next, we allocate memory for the tensor network operands and initialize them to random values. We also allocate memory for the gradient tensors corresponding to the input tensors selected for gradient computation, as well as for the output activation tensor, which is initialized to ones. Then, we initialize the cuTensorNet library via cutensornetCreate().
   /**********************
   * Allocating data
   **********************/

   std::vector<size_t> elementsVec;
   elementsVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      elementsVec[i] = 1;
      for (auto mode : modesVec[i])
         elementsVec[i] *= extent[mode];
   }

   size_t totalSize = 0;
   std::vector<size_t> sizeVec;
   sizeVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      sizeVec[i] = sizeof(floatType) * elementsVec[i];
      totalSize += sizeVec[i];
   }
   if(verbose)
      printf("Total GPU memory used for tensor storage: %.2f GiB\n",
             (totalSize) / 1024. / 1024. / 1024);

   void* rawDataIn_d[numInputs];
   void* O_d;
   void* outputActivation_d;
   for (int i = 0; i < numInputs; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
   }
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]));
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &outputActivation_d, sizeVec[numInputs]));

   floatType* rawDataIn_h[numInputs];
   for (int i = 0; i < numInputs; ++i)
   {
      rawDataIn_h[i] = (floatType*) malloc(sizeVec[i]);
      if (rawDataIn_h[i] == NULL)
      {
         printf("Error: Host memory allocation failed!\n");
         return -1;
      }
   }
   floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
   if (O_h == NULL)
   {
      printf("Error: Host memory allocation failed!\n");
      return -1;
   }
   floatType *outputActivation_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
   if (outputActivation_h == NULL)
   {
      printf("Error: Host memory allocation failed!\n");
      return -1;
   }

   void* gradientsOut_d[numInputs] = {nullptr};
   for (auto i : gradInputIDs)
   {
      HANDLE_CUDA_ERROR( cudaMalloc((void**) &gradientsOut_d[i], sizeVec[i]) );
   }
   void* gradientsOut_h[numInputs] = {nullptr};
   for (auto i : gradInputIDs)
   {
      gradientsOut_h[i] = (floatType*) malloc(sizeVec[i]);
      if (gradientsOut_h[i] == NULL)
      {
         printf("Error: Host memory allocation failed!\n");
         return -1;
      }
   }

   /*******************
   * Initialize data
   *******************/

   memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
   for (int i = 0; i < numInputs; ++i)
   {
      for (size_t e = 0; e < elementsVec[i]; ++e)
         rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
   }
   for (size_t e = 0; e < elementsVec[numInputs]; ++e)
      outputActivation_h[e] = (floatType) 1.0;

   for (int i = 0; i < numInputs; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
   }
   HANDLE_CUDA_ERROR( cudaMemcpy(outputActivation_d, outputActivation_h, sizeVec[numInputs], cudaMemcpyHostToDevice) );

   if(verbose)
      printf("Allocated GPU memory for data, initialized data, and created the library handle\n");

   /*************************
   * cuTensorNet
   *************************/

   cudaStream_t stream;
   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

   cutensornetHandle_t handle;
   HANDLE_ERROR( cutensornetCreate(&handle) );
Create the network descriptor and set gradient tensor IDs¶
Next, we create the network descriptor with the desired tensor modes, extents, and strides, as well as the data and compute types. We also set, on the network descriptor, the IDs of the input tensors whose gradients will be computed. Note that the created library context will be associated with the currently active GPU.
   /*******************************
   * Create Network Descriptor
   *******************************/

   int32_t* modesIn[numInputs];
   int32_t numModesIn[numInputs];
   int64_t* extentsIn[numInputs];
   int64_t* stridesIn[numInputs];

   for (int i = 0; i < numInputs; ++i)
   {
      modesIn[i] = modesVec[i].data();
      numModesIn[i] = modesVec[i].size();
      extentsIn[i] = extentVec[i].data();
      stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
   }

   // Set up tensor network
   cutensornetNetworkDescriptor_t descNet;
   HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
                     numInputs, numModesIn, extentsIn, stridesIn, modesIn, NULL,
                     modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
                     typeData, typeCompute,
                     &descNet) );

   /*******************************
   * Set input tensor ids that require gradient
   *******************************/

   cutensornetTensorIDList_t tensorIDList {
      .numTensors = static_cast<int32_t>(gradInputIDs.size()),
      .data = gradInputIDs.data()
   };

   HANDLE_ERROR(cutensornetNetworkSetAttribute(handle,
                                               descNet,
                                               CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD,
                                               &tensorIDList,
                                               sizeof(cutensornetTensorIDList_t)));

   if(verbose)
      printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");
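The NULL strides above select a generalized column-major layout, in which the leftmost mode varies fastest. For illustration only, the equivalent explicit strides could be computed and passed instead; a minimal sketch that reproduces what NULL already implies (it would have to run before cutensornetCreateNetworkDescriptor() to take effect):

   std::vector<std::vector<int64_t>> strideVec(numInputs);
   for (int i = 0; i < numInputs; ++i)
   {
      int64_t s = 1;
      for (size_t m = 0; m < modesVec[i].size(); ++m) // leftmost mode is fastest-varying
      {
         strideVec[i].push_back(s);
         s *= extentVec[i][m];
      }
      stridesIn[i] = strideVec[i].data(); // instead of NULL
   }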
Contraction order¶
In this example, we illustrate how to use a predetermined contraction path and set it into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute().
   /*******************************
   * Choose workspace limit based on available resources.
   *******************************/

   size_t freeMem, totalMem;
   HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
   uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
   if(verbose)
      printf("Workspace limit = %lu\n", workspaceLimit);

   /*******************************
   * Set contraction order
   *******************************/

   // Create contraction optimizer info
   cutensornetContractionOptimizerInfo_t optimizerInfo;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );

   // set a predetermined contraction path
   std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
   const auto numContractions = numInputs - 1;
   cutensornetContractionPath_t contPath;
   contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
   contPath.numContractions = numContractions;

   // provide user-specified contPath
   HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                     &contPath,
                     sizeof(contPath)));
   int64_t numSlices = 1;

   if(verbose)
      printf("Set predetermined contraction path into cuTensorNet optimizer\n");
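The flat path vector is interpreted as (first, second) node pairs, i.e., the five pairwise contractions (0,1), (0,4), (0,3), (0,2), (0,1) over the shrinking list of tensors. If no predetermined path is available, the cuTensorNet path finder can populate the same optimizerInfo object instead; a sketch of that alternative (replacing the SetAttribute call above), which also queries the estimated FLOP count of the discovered path:

   cutensornetContractionOptimizerConfig_t optimizerConfig;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig) );

   // Find a contraction path subject to the workspace limit
   HANDLE_ERROR( cutensornetContractionOptimize(handle, descNet, optimizerConfig,
                                                workspaceLimit, optimizerInfo) );

   // Query the estimated cost of the discovered path
   double flops = 0.0;
   HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(handle, optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                     &flops, sizeof(flops)) );

   HANDLE_ERROR( cutensornetDestroyContractionOptimizerConfig(optimizerConfig) );

When the path finder is used, the number of slices should likewise be queried from optimizerInfo (via the CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES attribute) rather than hardcoded to 1 as above.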
Create workspace descriptor and allocate workspace memory¶
Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. To enable gradient computation, we need to provide a CACHE workspace, which will be used to store intermediate tensor data needed by the backpropagation call. Thus, we query the sizes and allocate device memory for both kinds of workspace (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set these in the workspace descriptor. The workspace descriptor will be provided to the contraction plan creation, the planned contraction, and the gradient computation APIs.
   /*******************************
   * Create workspace descriptor, allocate workspace, and set it.
   *******************************/

   cutensornetWorkspaceDescriptor_t workDesc;
   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );

   // set SCRATCH workspace, which will be used during each network contraction operation, not needed afterwards
   int64_t requiredWorkspaceSizeScratch = 0;
   HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
                                                             descNet,
                                                             optimizerInfo,
                                                             workDesc) );

   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &requiredWorkspaceSizeScratch) );

   void* workScratch = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_SCRATCH,
                                               workScratch,
                                               requiredWorkspaceSizeScratch) );

   // set CACHE workspace, which will be used across network contraction operations
   int64_t requiredWorkspaceSizeCache = 0;
   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_CACHE,
                                                   &requiredWorkspaceSizeCache) );

   void* workCache = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_CACHE,
                                               workCache,
                                               requiredWorkspaceSizeCache) );

   if(verbose)
      printf("Allocated and set up the GPU workspace\n");
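Instead of allocating workspaces explicitly with cudaMalloc, a device memory handler can be registered on the library handle via cutensornetSetDeviceMemHandler(), letting cuTensorNet draw workspace memory from a user-provided pool on demand. A sketch of that alternative, assuming CUDA 11.2+ stream-ordered allocation (cudaMallocAsync/cudaFreeAsync); the hypothetical callbacks poolAlloc/poolFree would live at file scope:

   // Hypothetical pool callbacks; the handler interface expects 0 on success
   static int poolAlloc(void* ctx, void** ptr, size_t size, cudaStream_t stream) {
      return cudaMallocAsync(ptr, size, stream) != cudaSuccess;
   }
   static int poolFree(void* ctx, void* ptr, size_t size, cudaStream_t stream) {
      return cudaFreeAsync(ptr, stream) != cudaSuccess;
   }

   // ... inside main(), after cutensornetCreate():
   cutensornetDeviceMemHandler_t memHandler{};
   memHandler.ctx = nullptr;
   memHandler.device_alloc = poolAlloc;
   memHandler.device_free = poolFree;
   HANDLE_ERROR( cutensornetSetDeviceMemHandler(handle, &memHandler) );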
Contraction plan and auto-tune¶
We create a tensor network contraction plan holding all the pairwise tensor contraction plans for cuTENSOR. Optionally, we can auto-tune the plan so that cuTENSOR selects the best kernel for each pairwise contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of redundantly initializing the plan.
   /*******************************
   * Initialize the pairwise contraction plan (for cuTENSOR).
   *******************************/

   cutensornetContractionPlan_t plan;
   HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
                                                  descNet,
                                                  optimizerInfo,
                                                  workDesc,
                                                  &plan) );

   /*******************************
   * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
   *           for each pairwise tensor contraction.
   *******************************/
   cutensornetContractionAutotunePreference_t autotunePref;
   HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
                                                                &autotunePref) );

   const int numAutotuningIterations = 5; // may be 0
   HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
                     handle,
                     autotunePref,
                     CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
                     &numAutotuningIterations,
                     sizeof(numAutotuningIterations)) );

   // Modify the plan again to find the best pair-wise contractions
   HANDLE_ERROR( cutensornetContractionAutotune(handle,
                                                plan,
                                                rawDataIn_d,
                                                O_d,
                                                workDesc,
                                                autotunePref,
                                                stream) );

   HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );

   if(verbose)
      printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");
Tensor network contraction execution and gradient computation¶
Finally, we contract the tensor network as many times as needed, possibly with different input each time. After contracting the network (which stores intermediate tensor data in the CACHE memory), we compute the required gradients via backpropagation. We also purge the CACHE memory at each iteration to free up the available CACHE memory for subsequent calls.
   /**********************
   * Execute the tensor network contraction
   **********************/

   // Create a cutensornetSliceGroup_t object from a range of slice IDs
   cutensornetSliceGroup_t sliceGroup{};
   HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );

   GPUTimer timer {stream};
   double minTimeCUTENSORNET = 1e100;
   const int numRuns = 3; // number of repeats to get stable performance results
   for (int i = 0; i < numRuns; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
      HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

      /*
      * Contract all slices of the tensor network
      */
      timer.start();

      int32_t accumulateOutput = 0; // output tensor data will be overwritten
      HANDLE_ERROR( cutensornetContractSlices(handle,
                                              plan,
                                              rawDataIn_d,
                                              O_d,
                                              accumulateOutput,
                                              workDesc,
                                              sliceGroup, // alternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object
                                              stream) );

      HANDLE_ERROR( cutensornetComputeGradientsBackward(handle,
                                                        plan,
                                                        rawDataIn_d,
                                                        outputActivation_d,
                                                        gradientsOut_d,
                                                        accumulateOutput,
                                                        workDesc,
                                                        stream) );

      // Purge the cache to make room for the next run to use cache memory
      HANDLE_ERROR( cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE) );

      // Synchronize and measure best timing
      auto time = timer.seconds();
      minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
   }

   if(verbose)
      printf("Contracted the tensor network and computed gradients\n");

   HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // restore the output tensor on Host

   for (auto i : gradInputIDs)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(gradientsOut_h[i], gradientsOut_d[i], sizeVec[i], cudaMemcpyDeviceToHost) );
   }

   /*************************/

   if(verbose) {
      printf("Tensor network contraction and back-propagation time (ms): = %.3f\n", minTimeCUTENSORNET * 1000.f);
   }
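Because the output activation is all ones, the implicit scalar loss is L = Σ_{a,m} O_{a,m}, which makes the computed gradients easy to cross-check numerically. A sketch of such a check for one element of tensor A (not part of the sample); it reuses the existing plan and buffers and would be placed right here, before resources are freed:

   // Contract the network and reduce the output on the host: L = sum over O
   auto contractAndSum = [&](double& loss) {
      int32_t accumulate = 0; // overwrite the output tensor
      HANDLE_ERROR( cutensornetContractSlices(handle, plan, rawDataIn_d, O_d,
                                              accumulate, workDesc, sliceGroup, stream) );
      HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
      HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) );
      HANDLE_ERROR( cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE) );
      loss = 0.0;
      for (size_t e = 0; e < elementsVec[numInputs]; ++e) loss += O_h[e];
   };

   const floatType eps = 1e-2f; // fairly large step, since this is FP32 arithmetic
   double lossPlus = 0.0, lossMinus = 0.0;

   rawDataIn_h[0][0] += eps; // perturb A[0] upward
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[0], rawDataIn_h[0], sizeVec[0], cudaMemcpyHostToDevice) );
   contractAndSum(lossPlus);

   rawDataIn_h[0][0] -= 2 * eps; // perturb A[0] downward
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[0], rawDataIn_h[0], sizeVec[0], cudaMemcpyHostToDevice) );
   contractAndSum(lossMinus);

   rawDataIn_h[0][0] += eps; // restore A[0]
   const double fdGrad = (lossPlus - lossMinus) / (2.0 * eps);
   printf("dL/dA[0]: backprop = %f vs. finite difference = %f\n",
          (double) ((floatType*)gradientsOut_h[0])[0], fdGrad);

Since the network is linear in each input tensor, the central difference should agree with the backpropagated gradient up to FP32 rounding.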
Free resources¶
After the computation, we need to free up all the resources.
   /***************
   * Free resources
   ****************/

   // Free cuTensorNet resources
   HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
   HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
   HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
   HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
   HANDLE_ERROR( cutensornetDestroy(handle) );

   // Free Host memory resources
   if (O_h) free(O_h);
   if (outputActivation_h) free(outputActivation_h);
   for (int i = 0; i < numInputs; ++i)
   {
      if (rawDataIn_h[i])
         free(rawDataIn_h[i]);
      if (gradientsOut_h[i])
         free(gradientsOut_h[i]);
   }
   // Free GPU memory resources
   if (workScratch) cudaFree(workScratch);
   if (workCache) cudaFree(workCache);
   if (O_d) cudaFree(O_d);
   if (outputActivation_d) cudaFree(outputActivation_d);
   for (int i = 0; i < numInputs; ++i)
   {
      if (rawDataIn_d[i])
         cudaFree(rawDataIn_d[i]);
      if (gradientsOut_d[i])
         cudaFree(gradientsOut_d[i]);
   }
   if(verbose)
      printf("Freed resources and exited\n");

   return 0;
}