通过反向传播计算梯度

以下代码示例说明了如何通过反向传播计算张量网络相对于用户选择的输入张量的梯度。完整的代码可以在 NVIDIA/cuQuantum 仓库中找到 (此处)。

头文件和数据类型

  8#include <stdlib.h>
  9#include <stdio.h>
 10
 11#include <unordered_map>
 12#include <vector>
 13#include <cassert>
 14
 15#include <cuda_runtime.h>
 16#include <cutensornet.h>
 17
 18
 19#define HANDLE_ERROR(x)                                           \
 20{ const auto err = x;                                             \
 21  if( err != CUTENSORNET_STATUS_SUCCESS )                         \
 22  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
 23    fflush(stdout);                                               \
 24  }                                                               \
 25};
 26
 27#define HANDLE_CUDA_ERROR(x)                                      \
 28{ const auto err = x;                                             \
 29  if( err != cudaSuccess )                                        \
 30  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
 31    fflush(stdout);                                               \
 32  }                                                               \
 33};
 34
 35
 36struct GPUTimer
 37{
 38    GPUTimer(cudaStream_t stream): stream_(stream)
 39    {
 40        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
 41        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
 42    }
 43
 44    ~GPUTimer()
 45    {
 46        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
 47        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
 48    }
 49
 50    void start()
 51    {
 52        HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_));
 53    }
 54
 55    float seconds()
 56    {
 57        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
 58        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
 59        float time;
 60        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
 61        return time * 1e-3;
 62    }
 63
 64    private:
 65    cudaEvent_t start_, stop_;
 66    cudaStream_t stream_;
 67};
 68
 69
 70int main()
 71{
 72    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
 73
 74    bool verbose = true;
 75
 76    // Check cuTensorNet version
 77    const size_t cuTensornetVersion = cutensornetGetVersion();
 78    if(verbose)
 79        printf("cuTensorNet version: %ld\n", cuTensornetVersion);
 80
 81    // Set GPU device
 82    int numDevices {0};
 83    HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
 84    const int deviceId = 0;
 85    HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
 86    cudaDeviceProp prop;
 87    HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );
 88
 89    if(verbose) {
 90        printf("===== device info ======\n");
 91        printf("GPU-local-id:%d\n", deviceId);
 92        printf("GPU-name:%s\n", prop.name);
 93        printf("GPU-clock:%d\n", prop.clockRate);
 94        printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
 95        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
 96        printf("GPU-major:%d\n", prop.major);
 97        printf("GPU-minor:%d\n", prop.minor);
 98        printf("========================\n");
 99    }
100
101    typedef float floatType;
102    cudaDataType_t typeData = CUDA_R_32F;
103    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;
104
105    if(verbose)
106        printf("Included headers and defined data types\n");

定义张量网络和张量大小

接下来,我们定义张量网络的结构(即,张量的模式、范围及其连通性),并指定将计算其梯度的输入张量 ID。

110    /**********************
111    * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
112    * We will execute the contraction and compute the gradients of input tensors A, B, C
113    **********************/
114
115    constexpr int32_t numInputs = 6;
116    std::vector<int32_t> gradInputIDs = {0, 1, 2};
117
118    // Create vectors of tensor modes
119    std::vector<std::vector<int32_t>> modesVec {
120        {'a','b','c','d'},
121        {'b','c','d','e'},
122        {'e','g','h'},
123        {'g','h','i','j'},
124        {'i','j','k','l'},
125        {'k','l','m'},
126        {'a','m'}
127    };
128
129    // Set mode extents
130    int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
131    std::unordered_map<int32_t, int64_t> extent;
132    for (auto &vec: modesVec)
133    {
134        for (auto &mode: vec)
135        {
136            extent[mode] = sameExtent;
137        }
138    }
139
140    // Create a vector of extents for each tensor
141    std::vector<std::vector<int64_t>> extentVec;
142    extentVec.resize(numInputs+1); // hold inputs + output tensors
143    for (int i = 0; i < numInputs+1; ++i)
144    {
145        for (auto mode : modesVec[i])
146            extentVec[i].push_back(extent[mode]);
147    }
148
149    if(verbose)
150        printf("Defined tensor network, modes, and extents\n");

分配内存、初始化数据、初始化 cuTensorNet 句柄

接下来,我们为张量网络操作数分配内存,并将它们初始化为随机值。我们还为选定的用于梯度计算的输入张量分配了梯度张量的内存,以及初始化为 1 的激活张量。然后,我们通过 cutensornetCreate() 初始化 cuTensorNet 库。

153    /**********************
154    * Allocating data
155    **********************/
156
157    std::vector<size_t> elementsVec;
158    elementsVec.resize(numInputs+1); // hold inputs + output tensors
159    for (int i = 0; i < numInputs+1; ++i)
160    {
161        elementsVec[i] = 1;
162        for (auto mode : modesVec[i])
163            elementsVec[i] *= extent[mode];
164    }
165
166    size_t totalSize = 0;
167    std::vector<size_t> sizeVec;
168    sizeVec.resize(numInputs+1); // hold inputs + output tensors
169    for (int i = 0; i < numInputs+1; ++i)
170    {
171        sizeVec[i] = sizeof(floatType) * elementsVec[i];
172        totalSize += sizeVec[i];
173    }
174    if(verbose)
175        printf("Total GPU memory used for tensor storage: %.2f GiB\n",
176                (totalSize) / 1024. /1024. / 1024);
177
178    void* rawDataIn_d[numInputs];
179    void* O_d;
180    void* outputActivation_d;
181    for (int i = 0; i < numInputs; ++i)
182    {
183        HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
184    }
185    HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]));
186    HANDLE_CUDA_ERROR( cudaMalloc((void**) &outputActivation_d, sizeVec[numInputs]));
187
188    floatType* rawDataIn_h[numInputs];
189    for (int i = 0; i < numInputs; ++i)
190    {
191        rawDataIn_h[i] = (floatType*) malloc(sizeVec[i]);
192        if (rawDataIn_h[i] == NULL)
193        {
194           printf("Error: Host memory allocation failed!\n");
195           return -1;
196        }
197    }
198    floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
199    if (O_h == NULL)
200    {
201        printf("Error: Host memory allocation failed!\n");
202        return -1;
203    }
204    floatType *outputActivation_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
205    if (outputActivation_h == NULL)
206    {
207        printf("Error: Host memory allocation failed!\n");
208        return -1;
209    }
210
211    void* gradientsOut_d[numInputs] = {nullptr};
212    for (auto i : gradInputIDs)
213    {
214        HANDLE_CUDA_ERROR( cudaMalloc((void**) &gradientsOut_d[i], sizeVec[i]) );
215    }
216    void* gradientsOut_h[numInputs] = {nullptr};
217    for (auto i : gradInputIDs)
218    {
219        gradientsOut_h[i] = (floatType*) malloc(sizeVec[i]);
220        if (gradientsOut_h[i] == NULL)
221        {
222           printf("Error: Host memory allocation failed!\n");
223           return -1;
224        }
225    }
226
227    /*******************
228    * Initialize data
229    *******************/
230
231    memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
232    for (int i = 0; i < numInputs; ++i)
233    {
234        for (size_t e = 0; e < elementsVec[i]; ++e)
235            rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
236    }
237    for (size_t e = 0; e < elementsVec[numInputs]; ++e)
238        outputActivation_h[e] = (floatType) 1.0;
239
240    for (int i = 0; i < numInputs; ++i)
241    {
242        HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
243    }
244    HANDLE_CUDA_ERROR( cudaMemcpy(outputActivation_d, outputActivation_h, sizeVec[numInputs], cudaMemcpyHostToDevice) );
245
246    if(verbose)
247        printf("Allocated GPU memory for data, initialize data, and create library handle\n");
248
249    /*************************
250    * cuTensorNet
251    *************************/
252
253    cudaStream_t stream;
254    HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
255
256    cutensornetHandle_t handle;
257    HANDLE_ERROR( cutensornetCreate(&handle) );

创建网络描述符并设置梯度张量 ID

接下来,我们使用所需的张量模式、范围和步幅以及数据和计算类型创建网络描述符。我们还在网络描述符处设置了将计算其梯度的输入张量 ID。请注意,创建的库上下文将与当前活动的 GPU 相关联。

260    /*******************************
261    * Create Network Descriptor
262    *******************************/
263
264    int32_t* modesIn[numInputs];
265    int32_t numModesIn[numInputs];
266    int64_t* extentsIn[numInputs];
267    int64_t* stridesIn[numInputs];
268    
269    for (int i = 0; i < numInputs; ++i)
270    {
271        modesIn[i] = modesVec[i].data();
272        numModesIn[i] = modesVec[i].size();
273        extentsIn[i] = extentVec[i].data();
274        stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
275    }
276
277    // Set up tensor network
278    cutensornetNetworkDescriptor_t descNet;
279    HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
280                        numInputs, numModesIn, extentsIn, stridesIn, modesIn, NULL,
281                        modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
282                        typeData, typeCompute,
283                        &descNet) );
284
285    /*******************************
286    * Set input tensor ids that requrie gradient
287    *******************************/
288
289    cutensornetTensorIDList_t tensorIDList {
290        .numTensors = static_cast<int32_t>(gradInputIDs.size()),
291        .data = gradInputIDs.data()
292    };
293
294    HANDLE_ERROR(cutensornetNetworkSetAttribute(handle,
295                                                descNet,
296                                                CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD,
297                                                &tensorIDList,
298                                                sizeof(cutensornetTensorIDList_t)));
299
300    if(verbose)
301        printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");

收缩顺序

在本示例中,我们说明了如何使用预定的收缩路径,并通过 cutensornetContractionOptimizerInfoSetAttribute() 将其设置到优化器信息对象中。

304    /*******************************
305    * Choose workspace limit based on available resources.
306    *******************************/
307
308    size_t freeMem, totalMem;
309    HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
310    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
311    if(verbose)
312        printf("Workspace limit = %lu\n", workspaceLimit);
313
314    /*******************************
315    * Set contraction order
316    *******************************/
317
318    // Create contraction optimizer info
319    cutensornetContractionOptimizerInfo_t optimizerInfo;
320    HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );
321
322    // set a predetermined contraction path
323    std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
324    const auto numContractions = numInputs - 1;
325    cutensornetContractionPath_t contPath;
326    contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
327    contPath.numContractions = numContractions;
328
329    // provide user-specified contPath
330    HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
331                    handle,
332                    optimizerInfo,
333                    CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
334                    &contPath,
335                    sizeof(contPath)));
336    int64_t numSlices = 1;
337
338    if(verbose)
339        printf("Set predetermined contraction path into cuTensorNet optimizer\n");

创建工作区描述符并分配工作区内存

接下来,我们创建一个工作区描述符,计算工作区大小,并查询收缩网络所需的最小工作区大小。为了启用梯度计算,我们需要提供 CACHE 工作区,该工作区将用于存储反向传播调用所需使用的中间张量数据。因此,我们查询大小并为两种类型的工作区(CUTENSORNET_WORKSPACE_SCRATCHCUTENSORNET_WORKSPACE_CACHE)分配设备内存,并将这些设置在工作区描述符中。工作区描述符将提供给收缩计划创建、计划收缩和梯度计算 API。

342    /*******************************
343    * Create workspace descriptor, allocate workspace, and set it.
344    *******************************/
345
346    cutensornetWorkspaceDescriptor_t workDesc;
347    HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
348
349    // set SCRATCH workspace, which will be used during each network contraction operation, not needed afterwords
350    int64_t requiredWorkspaceSizeScratch = 0;
351    HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
352                                                            descNet,
353                                                            optimizerInfo,
354                                                            workDesc) );
355
356    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
357                                                    workDesc,
358                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
359                                                    CUTENSORNET_MEMSPACE_DEVICE,
360                                                    CUTENSORNET_WORKSPACE_SCRATCH,
361                                                    &requiredWorkspaceSizeScratch) );
362
363    void* workScratch = nullptr;
364    HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );
365
366    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
367                                                workDesc,
368                                                CUTENSORNET_MEMSPACE_DEVICE,
369                                                CUTENSORNET_WORKSPACE_SCRATCH,
370                                                workScratch,
371                                                requiredWorkspaceSizeScratch) );
372
373    // set CACHE workspace, which will be used across network contraction operations
374    int64_t requiredWorkspaceSizeCache = 0;
375    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
376                                                    workDesc,
377                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
378                                                    CUTENSORNET_MEMSPACE_DEVICE,
379                                                    CUTENSORNET_WORKSPACE_CACHE,
380                                                    &requiredWorkspaceSizeCache) );
381
382    void* workCache = nullptr;
383    HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );
384
385    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
386                                                workDesc,
387                                                CUTENSORNET_MEMSPACE_DEVICE,
388                                                CUTENSORNET_WORKSPACE_CACHE,
389                                                workCache,
390                                                requiredWorkspaceSizeCache) );
391
392    if(verbose)
393        printf("Allocated and set up the GPU workspace\n");

收缩计划和自动调优

我们创建一个张量网络收缩计划,其中包含 cuTENSOR 的所有成对张量收缩计划。 可选地,我们可以自动调整计划,以便 cuTENSOR 为每个成对收缩选择最佳内核。 此收缩计划可以为许多(可能不同的)数据输入重用,从而避免冗余地初始化此计划的成本。

396    /*******************************
397    * Initialize the pairwise contraction plan (for cuTENSOR).
398    *******************************/
399
400    cutensornetContractionPlan_t plan;
401    HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
402                                                    descNet,
403                                                    optimizerInfo,
404                                                    workDesc,
405                                                    &plan) );
406
407    /*******************************
408    * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
409    *           for each pairwise tensor contraction.
410    *******************************/
411    cutensornetContractionAutotunePreference_t autotunePref;
412    HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
413                                                        &autotunePref) );
414
415    const int numAutotuningIterations = 5; // may be 0
416    HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
417                            handle,
418                            autotunePref,
419                            CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
420                            &numAutotuningIterations,
421                            sizeof(numAutotuningIterations)) );
422
423    // Modify the plan again to find the best pair-wise contractions
424    HANDLE_ERROR( cutensornetContractionAutotune(handle,
425                                                    plan,
426                                                    rawDataIn_d,
427                                                    O_d,
428                                                    workDesc,
429                                                    autotunePref,
430                                                    stream) );
431
432    HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );
433
434    if(verbose)
435        printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");

张量网络收缩执行和梯度计算

最后,我们根据需要多次收缩张量网络,每次可能使用不同的输入。在收缩网络(这会将中间张量的数据存储在 CACHE 内存中)之后,我们通过反向传播计算所需的梯度。我们还在每次迭代时清除 CACHE 内存,以便为后续调用利用可用的 CACHE 内存腾出空间。

438    /**********************
439    * Execute the tensor network contraction
440    **********************/
441
442   // Create a cutensornetSliceGroup_t object from a range of slice IDs
443   cutensornetSliceGroup_t sliceGroup{};
444   HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );
445
446    GPUTimer timer {stream};
447    double minTimeCUTENSORNET = 1e100;
448    const int numRuns = 3; // number of repeats to get stable performance results
449    for (int i = 0; i < numRuns; ++i)
450    {
451        HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
452        HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );
453
454        /*
455        * Contract all slices of the tensor network
456        */
457        timer.start();
458
459        int32_t accumulateOutput = 0; // output tensor data will be overwritten
460        HANDLE_ERROR( cutensornetContractSlices(handle,
461                       plan,
462                       rawDataIn_d,
463                       O_d,
464                       accumulateOutput,
465                       workDesc,
466                       sliceGroup, // alternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object
467                       stream) );
468
469        HANDLE_ERROR( cutensornetComputeGradientsBackward(handle,
470                        plan,
471                        rawDataIn_d,
472                        outputActivation_d, 
473                        gradientsOut_d,
474                        accumulateOutput,
475                        workDesc,
476                        stream) );
477
478        // Purge the cache to make room for the next run to use cache memory
479        HANDLE_ERROR( cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE) );
480
481        // Synchronize and measure best timing
482        auto time = timer.seconds();
483        minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
484    }
485
486    if(verbose)
487        printf("Contracted the tensor network and computed gradients\n");
488
489    HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // restore the output tensor on Host
490    
491    for (auto i : gradInputIDs)
492    {
493        HANDLE_CUDA_ERROR( cudaMemcpy(gradientsOut_h[i], gradientsOut_d[i], sizeVec[i], cudaMemcpyDeviceToHost) );
494    }
495
496    /*************************/
497
498    if(verbose) {
499        printf("Tensor network contraction and back-propagation time (ms): = %.3f\n", minTimeCUTENSORNET * 1000.f);
500    }

释放资源

计算完成后,我们需要释放所有资源。

503    /***************
504    * Free resources
505    ****************/
506
507    // Free cuTensorNet resources
508    HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
509    HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
510    HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
511    HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
512    HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
513    HANDLE_ERROR( cutensornetDestroy(handle) );
514
515    // Free Host memory resources
516    if (O_h) free(O_h);
517    if (outputActivation_h) free(outputActivation_h);
518    for (int i = 0; i < numInputs; ++i)
519    {
520        if (rawDataIn_h[i]) 
521            free(rawDataIn_h[i]);
522        if (gradientsOut_h[i]) 
523            free(gradientsOut_h[i]);
524    }
525    // Free GPU memory resources
526    if (workScratch) cudaFree(workScratch);
527    if (workCache) cudaFree(workCache);
528    if (O_d) cudaFree(O_d);
529    if (outputActivation_d) cudaFree(outputActivation_d);
530    for (int i = 0; i < numInputs; ++i)
531    {
532        if (rawDataIn_d[i]) 
533            cudaFree(rawDataIn_d[i]);
534        if (gradientsOut_d[i]) 
535            cudaFree(gradientsOut_d[i]);
536    }
537    if(verbose)
538        printf("Freed resources and exited\n");
539
540    return 0;
541}