缓存/重用恒定中间张量

以下代码示例说明了如何激活恒定中间张量的缓存,以便在重复执行张量网络收缩时大幅加速,其中只有部分输入张量在每次迭代中更改其值。完整的代码可以在 NVIDIA/cuQuantum 仓库中找到 (此处)。

头文件和数据类型

  8#include <stdlib.h>
  9#include <stdio.h>
 10
 11#include <unordered_map>
 12#include <vector>
 13#include <cassert>
 14
 15#include <cuda_runtime.h>
 16#include <cutensornet.h>
 17
 18
 19#define HANDLE_ERROR(x)                                           \
 20{ const auto err = x;                                             \
 21  if( err != CUTENSORNET_STATUS_SUCCESS )                         \
 22  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
 23    fflush(stdout);                                               \
 24  }                                                               \
 25};
 26
 27#define HANDLE_CUDA_ERROR(x)                                      \
 28{ const auto err = x;                                             \
 29  if( err != cudaSuccess )                                        \
 30  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
 31    fflush(stdout);                                               \
 32  }                                                               \
 33};
 34
 35
 36struct GPUTimer
 37{
 38    GPUTimer(cudaStream_t stream): stream_(stream)
 39    {
 40        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
 41        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
 42    }
 43
 44    ~GPUTimer()
 45    {
 46        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
 47        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
 48    }
 49
 50    void start()
 51    {
 52        HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_));
 53    }
 54
 55    float seconds()
 56    {
 57        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
 58        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
 59        float time;
 60        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
 61        return time * 1e-3;
 62    }
 63
 64    private:
 65    cudaEvent_t start_, stop_;
 66    cudaStream_t stream_;
 67};
 68
 69
 70int main()
 71{
 72    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
 73
 74    bool verbose = true;
 75
 76    // Check cuTensorNet version
 77    const size_t cuTensornetVersion = cutensornetGetVersion();
 78    if(verbose)
 79        printf("cuTensorNet version: %ld\n", cuTensornetVersion);
 80
 81    // Set GPU device
 82    int numDevices {0};
 83    HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
 84    const int deviceId = 0;
 85    HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
 86    cudaDeviceProp prop;
 87    HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );
 88
 89    if(verbose) {
 90        printf("===== device info ======\n");
 91        printf("GPU-local-id:%d\n", deviceId);
 92        printf("GPU-name:%s\n", prop.name);
 93        printf("GPU-clock:%d\n", prop.clockRate);
 94        printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
 95        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
 96        printf("GPU-major:%d\n", prop.major);
 97        printf("GPU-minor:%d\n", prop.minor);
 98        printf("========================\n");
 99    }
100
101    typedef float floatType;
102    cudaDataType_t typeData = CUDA_R_32F;
103    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;
104
105    if(verbose)
106        printf("Included headers and defined data types\n");

定义张量网络和张量大小

接下来,我们定义张量网络的结构(即,张量的模式、范围及其连接性)。

110    /**********************
111    * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,f,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
112    * We will execute the contraction a few times assuming all input tensors being constant except F.
113    **********************/
114
115    constexpr int32_t numInputs = 6;
116
117    // Create vectors of tensor modes
118    std::vector<std::vector<int32_t>> modesVec {
119        {'a','b','c','d'},
120        {'b','c','d','e'},
121        {'e','f','g','h'},
122        {'g','h','i','j'},
123        {'i','j','k','l'},
124        {'k','l','m'},
125        {'a','m'}
126    };
127
128    // Set mode extents
129    int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
130    std::unordered_map<int32_t, int64_t> extent;
131    for (auto &vec: modesVec)
132    {
133        for (auto &mode: vec)
134        {
135            extent[mode] = sameExtent;
136        }
137    }
138
139    // Create a vector of extents for each tensor
140    std::vector<std::vector<int64_t>> extentVec;
141    extentVec.resize(numInputs+1); // hold inputs + output tensors
142    for (int i = 0; i < numInputs+1; ++i)
143    {
144        for (auto mode : modesVec[i])
145            extentVec[i].push_back(extent[mode]);
146    }
147
148    if(verbose)
149        printf("Defined tensor network, modes, and extents\n");

分配内存、初始化数据、初始化 cuTensorNet 句柄

接下来,我们为张量网络操作数分配内存并将它们初始化为随机值。然后,我们通过 cutensornetCreate() 初始化 cuTensorNet 库。

152    /**********************
153    * Allocating data
154    **********************/
155
156    std::vector<size_t> elementsVec;
157    elementsVec.resize(numInputs+1); // hold inputs + output tensors
158    for (int i = 0; i < numInputs+1; ++i)
159    {
160        elementsVec[i] = 1;
161        for (auto mode : modesVec[i])
162            elementsVec[i] *= extent[mode];
163    }
164
165    size_t totalSize = 0;
166    std::vector<size_t> sizeVec;
167    sizeVec.resize(numInputs+1); // hold inputs + output tensors
168    for (int i = 0; i < numInputs+1; ++i)
169    {
170        sizeVec[i] = sizeof(floatType) * elementsVec[i];
171        totalSize += sizeVec[i];
172    }
173    if(verbose)
174        printf("Total GPU memory used for tensor storage: %.2f GiB\n",
175                (totalSize) / 1024. /1024. / 1024);
176
177    void* rawDataIn_d[numInputs];
178    void* O_d;
179    for (int i = 0; i < numInputs; ++i)
180    {
181        HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
182    }
183    HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]));
184
185    floatType* rawDataIn_h[numInputs];
186    for (int i = 0; i < numInputs; ++i)
187    {
188        rawDataIn_h[i] = (floatType*) malloc(sizeof(floatType) * elementsVec[i]);
189        if (rawDataIn_h[i] == NULL)
190        {
191           printf("Error: Host memory allocation failed!\n");
192           return -1;
193        }
194    }
195    floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
196    if (O_h == NULL)
197    {
198        printf("Error: Host memory allocation failed!\n");
199        return -1;
200    }
201
202    /*******************
203    * Initialize data
204    *******************/
205
206    memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
207    for (int i = 0; i < numInputs; ++i)
208    {
209        for (size_t e = 0; e < elementsVec[i]; ++e)
210            rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
211    }
212
213    for (int i = 0; i < numInputs; ++i)
214    {
215        HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
216    }
217
218    if(verbose)
219        printf("Allocated GPU memory for data, initialize data, and create library handle\n");
220
221    /*************************
222    * cuTensorNet
223    *************************/
224
225    cudaStream_t stream;
226    HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
227
228    cutensornetHandle_t handle;
229    HANDLE_ERROR( cutensornetCreate(&handle) );

标记恒定张量并创建网络描述符

接下来,我们指定哪些输入张量是恒定的,并使用所需的张量模式、范围、步长和限定符(例如,恒定)以及数据和计算类型创建网络描述符。请注意,创建的库上下文将与当前活动的 GPU 相关联。

232    /*******************************
233    * Set constant input tensors
234    *******************************/
235
236    // specify which input tensors are constant
237    std::vector<cutensornetTensorQualifiers_t> qualifiersIn;
238    qualifiersIn.resize(numInputs);
239    for (int i = 0; i < numInputs; ++i)
240    {
241        if (i < 5)
242            qualifiersIn[i].isConstant = 1;
243        else
244            qualifiersIn[i].isConstant = 0;
245    }
246
247    /*******************************
248    * Create Network Descriptor
249    *******************************/
250
251    int32_t* modesIn[numInputs];
252    int32_t numModesIn[numInputs];
253    int64_t* extentsIn[numInputs];
254    int64_t* stridesIn[numInputs];
255    
256    for (int i = 0; i < numInputs; ++i)
257    {
258        modesIn[i] = modesVec[i].data();
259        numModesIn[i] = modesVec[i].size();
260        extentsIn[i] = extentVec[i].data();
261        stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
262    }
263
264    // Set up tensor network
265    cutensornetNetworkDescriptor_t descNet;
266    HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
267                        numInputs, numModesIn, extentsIn, stridesIn, modesIn, qualifiersIn.data(),
268                        modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
269                        typeData, typeCompute,
270                        &descNet) );
271
272    if(verbose)
273        printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");

收缩顺序和切片

在此示例中,我们说明了如何使用预定的收缩路径并通过 cutensornetContractionOptimizerInfoSetAttribute() 将其设置到优化器信息对象中。

276    /*******************************
277    * Choose workspace limit based on available resources.
278    *******************************/
279
280    size_t freeMem, totalMem;
281    HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
282    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
283    if(verbose)
284        printf("Workspace limit = %lu\n", workspaceLimit);
285
286    /*******************************
287    * Set contraction order
288    *******************************/
289
290    // Create contraction optimizer info
291    cutensornetContractionOptimizerInfo_t optimizerInfo;
292    HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );
293
294    // set a predetermined contraction path
295    std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
296    const auto numContractions = numInputs - 1;
297    cutensornetContractionPath_t contPath;
298    contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
299    contPath.numContractions = numContractions;
300
301    // provide user-specified contPath
302    HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
303                    handle,
304                    optimizerInfo,
305                    CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
306                    &contPath,
307                    sizeof(contPath)));
308    int64_t numSlices = 1;
309
310    if(verbose)
311        printf("Set predetermined contraction path into cuTensorNet optimizer\n");

创建工作区描述符并分配工作区内存

接下来,我们创建一个工作区描述符,计算工作区大小,并查询收缩网络所需的最小工作区大小。为了激活中间张量重用,我们需要提供 CACHE 工作区,该工作区将在多个网络收缩中使用。因此,我们查询大小并为两种工作区(CUTENSORNET_WORKSPACE_SCRATCHCUTENSORNET_WORKSPACE_CACHE)分配设备内存,并将这些设置在工作区描述符中。工作区描述符将提供给收缩计划创建和收缩 API。

314    /*******************************
315    * Create workspace descriptor, allocate workspace, and set it.
316    *******************************/
317
318    cutensornetWorkspaceDescriptor_t workDesc;
319    HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
320
321    // set SCRATCH workspace, which will be used during each network contraction operation, not needed afterwords
322    int64_t requiredWorkspaceSizeScratch = 0;
323    HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
324                                                            descNet,
325                                                            optimizerInfo,
326                                                            workDesc) );
327
328    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
329                                                    workDesc,
330                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
331                                                    CUTENSORNET_MEMSPACE_DEVICE,
332                                                    CUTENSORNET_WORKSPACE_SCRATCH,
333                                                    &requiredWorkspaceSizeScratch) );
334
335    void* workScratch = nullptr;
336    HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );
337
338    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
339                                                workDesc,
340                                                CUTENSORNET_MEMSPACE_DEVICE,
341                                                CUTENSORNET_WORKSPACE_SCRATCH,
342                                                workScratch,
343                                                requiredWorkspaceSizeScratch) );
344
345    // set CACHE workspace, which will be used across network contraction operations
346    int64_t requiredWorkspaceSizeCache = 0;
347    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
348                                                    workDesc,
349                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
350                                                    CUTENSORNET_MEMSPACE_DEVICE,
351                                                    CUTENSORNET_WORKSPACE_CACHE,
352                                                    &requiredWorkspaceSizeCache) );
353
354    void* workCache = nullptr;
355    HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );
356
357    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
358                                                workDesc,
359                                                CUTENSORNET_MEMSPACE_DEVICE,
360                                                CUTENSORNET_WORKSPACE_CACHE,
361                                                workCache,
362                                                requiredWorkspaceSizeCache) );
363
364    if(verbose)
365        printf("Allocated and set up the GPU workspace\n");

请注意,可以跳过创建工作区描述符和显式处理工作区内存的步骤,而是通过设置设备内存处理程序,在这种情况下,cuTensorNet 将通过从提供的内存池分配/释放内存来隐式处理工作区内存。有关详细信息,请参阅 内存管理 API

收缩计划和自动调优

我们创建一个张量网络收缩计划,其中包含 cuTENSOR 的所有成对张量收缩计划。可选地,我们可以自动调优计划,以便 cuTENSOR 为每个成对收缩选择最佳内核。此收缩计划可以重用于许多(可能不同的)数据输入,从而避免冗余地初始化此计划的成本。

368    /*******************************
369    * Initialize the pairwise contraction plan (for cuTENSOR).
370    *******************************/
371
372    cutensornetContractionPlan_t plan;
373    HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
374                                                    descNet,
375                                                    optimizerInfo,
376                                                    workDesc,
377                                                    &plan) );
378
379    /*******************************
380    * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
381    *           for each pairwise tensor contraction.
382    *******************************/
383    cutensornetContractionAutotunePreference_t autotunePref;
384    HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
385                                                        &autotunePref) );
386
387    const int numAutotuningIterations = 5; // may be 0
388    HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
389                            handle,
390                            autotunePref,
391                            CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
392                            &numAutotuningIterations,
393                            sizeof(numAutotuningIterations)) );
394
395    // Modify the plan again to find the best pair-wise contractions
396    HANDLE_ERROR( cutensornetContractionAutotune(handle,
397                                                    plan,
398                                                    rawDataIn_d,
399                                                    O_d,
400                                                    workDesc,
401                                                    autotunePref,
402                                                    stream) );
403
404    HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );
405
406    if(verbose)
407        printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");

张量网络收缩执行

最后,我们根据需要多次收缩张量网络,每次可能使用不同的输入。请注意,第一个网络收缩调用将利用提供的 CACHE 工作区来存储恒定中间张量。后续的网络收缩将使用缓存的数据来大幅减少计算时间。

410    /**********************
411    * Execute the tensor network contraction
412    **********************/
413
414    // Create a cutensornetSliceGroup_t object from a range of slice IDs
415    cutensornetSliceGroup_t sliceGroup{};
416    HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );
417
418    GPUTimer timer {stream};
419    double minTimeCUTENSORNET = 1e100;
420    double firstTimeCUTENSORNET = 1e100;
421    const int numRuns = 3; // number of repeats to get stable performance results
422    for (int i = 0; i < numRuns; ++i)
423    {
424        HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
425        HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );
426
427        /*
428        * Contract all slices of the tensor network
429        */
430        timer.start();
431
432        int32_t accumulateOutput = 0; // output tensor data will be overwritten
433        HANDLE_ERROR( cutensornetContractSlices(handle,
434                        plan,
435                        rawDataIn_d,
436                        O_d,
437                        accumulateOutput,
438                        workDesc,
439                        sliceGroup, // slternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object
440                        stream) );
441
442        // Synchronize and measure best timing
443        auto time = timer.seconds();
444        if (i == 0) 
445            firstTimeCUTENSORNET = time;
446        minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
447    }
448
449    if(verbose)
450        printf("Contracted the tensor network, each slice used the same contraction plan\n");
451
452    // Print the 1-norm of the output tensor (verification)
453    HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
454    HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // restore the output tensor on Host
455    double norm1 = 0.0;
456    for (int64_t i = 0; i < elementsVec[numInputs]; ++i) {
457        norm1 += std::abs(O_h[i]);
458    }
459    if(verbose)
460        printf("Computed the 1-norm of the output tensor: %e\n", norm1);
461
462    /*************************/
463
464    // Query the total Flop count for the tensor network contraction
465    double flops {0.0};
466    HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
467                        handle,
468                        optimizerInfo,
469                        CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
470                        &flops,
471                        sizeof(flops)) );
472
473    if(verbose) {
474        printf("Number of tensor network slices = %ld\n", numSlices);
475        printf("Network contraction flop cost = %e\n", flops);
476        printf("Tensor network contraction time (ms):\n");
477        printf("\tfirst run (intermediate tensors get cached) = %.3f\n", firstTimeCUTENSORNET * 1000.f);
478        printf("\tsubsequent run (cache reused) = %.3f\n", minTimeCUTENSORNET * 1000.f);
479    }

释放资源

计算完成后,我们需要释放所有资源。

482    /***************
483    * Free resources
484    ****************/
485
486    // Free cuTensorNet resources
487    HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
488    HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
489    HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
490    HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
491    HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
492    HANDLE_ERROR( cutensornetDestroy(handle) );
493
494    // Free Host memory resources
495    if (O_h) free(O_h);
496    for (int i = 0; i < numInputs; ++i)
497    {
498        if (rawDataIn_h[i]) 
499            free(rawDataIn_h[i]);
500    }
501    // Free GPU memory resources
502    if (workScratch) cudaFree(workScratch);
503    if (workCache) cudaFree(workCache);
504    if (O_d) cudaFree(O_d);
505    for (int i = 0; i < numInputs; ++i)
506    {
507        if (rawDataIn_d[i]) 
508            cudaFree(rawDataIn_d[i]);
509    }
510    if(verbose)
511        printf("Freed resources and exited\n");
512
513    return 0;
514}