Getting Started
In this section, we show how to implement a first tensor contraction using cuTENSOR. Our code computes the following operation using single-precision arithmetic.
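Written out with the mode labels used later in the code (and cuTENSOR's usual \(\alpha A B + \beta C\) form), the contraction is:

\[ C_{m,u,n,v} = \alpha \, A_{m,h,k,n} \, B_{u,k,v,h} + \beta \, C_{m,u,n,v} \]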
We build the code up step by step, with each step appending code at the end. The steps are separated by comments consisting of multiple stars.
Installation and Compilation
cuTENSOR can be downloaded here. Assuming cuTENSOR has been downloaded to CUTENSOR_DOWNLOAD_FILE, we extract it, remember cuTENSOR's root location, and update the library path accordingly (in this example we use the CUDA 12 build of the library, matching the lib/12 path below; depending on your toolkit, you may have to choose a different version).
tar xf ${CUTENSOR_DOWNLOAD_FILE}
export CUTENSOR_ROOT=${PWD}/libcutensor
export LD_LIBRARY_PATH=${CUTENSOR_ROOT}/lib/12/:${LD_LIBRARY_PATH}
If we store the following code in a file called contraction.cu, we can compile it via the following command:
nvcc contraction.cu -L${CUTENSOR_ROOT}/lib/12/ -I${CUTENSOR_ROOT}/include -std=c++11 -lcutensor -o contraction
When compiling the intermediate steps of this example, the compiler may warn about unused variables; this is due to the example being incomplete. The final step should compile without warnings.
Headers and Data Types
To get started, we begin with a bare-bones main() function, include some headers, and define some data types.
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cutensor.h>
int main(int argc, char** argv)
{
// Host element type definition
typedef float floatTypeA;
typedef float floatTypeB;
typedef float floatTypeC;
typedef float floatTypeCompute;
// CUDA types
cutensorDataType_t typeA = CUTENSOR_R_32F;
cutensorDataType_t typeB = CUTENSOR_R_32F;
cutensorDataType_t typeC = CUTENSOR_R_32F;
cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;
printf("Include headers and define data types\n");
return 0;
}
Define Tensor Sizes
Next, we define the modes and extents of our tensors. For the sake of this example, let modes \(m\), \(n\), and \(u\) have an extent of 96, and let modes \(v\), \(h\), and \(k\) have an extent of 64. Notice how the modes are labeled by integers, which is what allows us to label them with characters. See the glossary for an explanation of the terms mode and extent.
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cutensor.h>
#include <unordered_map>
#include <vector>
int main(int argc, char** argv)
{
// Host element type definition
typedef float floatTypeA;
typedef float floatTypeB;
typedef float floatTypeC;
typedef float floatTypeCompute;
// CUDA types
cutensorDataType_t typeA = CUTENSOR_R_32F;
cutensorDataType_t typeB = CUTENSOR_R_32F;
cutensorDataType_t typeC = CUTENSOR_R_32F;
cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;
printf("Include headers and define data types\n");
/* ***************************** */
// Create vector of modes
std::vector<int> modeC{'m','u','n','v'};
std::vector<int> modeA{'m','h','k','n'};
std::vector<int> modeB{'u','k','v','h'};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();
// Extents
std::unordered_map<int, int64_t> extent;
extent['m'] = 96;
extent['n'] = 96;
extent['u'] = 96;
extent['v'] = 64;
extent['h'] = 64;
extent['k'] = 64;
// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for(auto mode : modeC)
extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for(auto mode : modeA)
extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for(auto mode : modeB)
extentB.push_back(extent[mode]);
printf("Define modes and extents\n");
return 0;
}
Initialize Tensor Data
Next, we allocate and initialize host and device memory for the tensors:
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cutensor.h>
#include <unordered_map>
#include <vector>
int main(int argc, char** argv)
{
// Host element type definition
typedef float floatTypeA;
typedef float floatTypeB;
typedef float floatTypeC;
typedef float floatTypeCompute;
// CUDA types
cutensorDataType_t typeA = CUTENSOR_R_32F;
cutensorDataType_t typeB = CUTENSOR_R_32F;
cutensorDataType_t typeC = CUTENSOR_R_32F;
cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;
printf("Include headers and define data types\n");
/* ***************************** */
// Create vector of modes
std::vector<int> modeC{'m','u','n','v'};
std::vector<int> modeA{'m','h','k','n'};
std::vector<int> modeB{'u','k','v','h'};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();
// Extents
std::unordered_map<int, int64_t> extent;
extent['m'] = 96;
extent['n'] = 96;
extent['u'] = 96;
extent['v'] = 64;
extent['h'] = 64;
extent['k'] = 64;
// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for(auto mode : modeC)
extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for(auto mode : modeA)
extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for(auto mode : modeB)
extentB.push_back(extent[mode]);
printf("Define modes and extents\n");
/* ***************************** */
// Number of elements of each tensor
size_t elementsA = 1;
for(auto mode : modeA)
elementsA *= extent[mode];
size_t elementsB = 1;
for(auto mode : modeB)
elementsB *= extent[mode];
size_t elementsC = 1;
for(auto mode : modeC)
elementsC *= extent[mode];
// Size in bytes
size_t sizeA = sizeof(floatTypeA) * elementsA;
size_t sizeB = sizeof(floatTypeB) * elementsB;
size_t sizeC = sizeof(floatTypeC) * elementsC;
// Allocate on device
void *A_d, *B_d, *C_d;
cudaMalloc((void**)&A_d, sizeA);
cudaMalloc((void**)&B_d, sizeB);
cudaMalloc((void**)&C_d, sizeC);
// Allocate on host
floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);
// Initialize data on host
for(int64_t i = 0; i < elementsA; i++)
A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsB; i++)
B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsC; i++)
C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
// Copy to device
cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice);
printf("Allocate, initialize and transfer tensors\n");
/**********************
* Free allocated data
**********************/
if (A) free(A);
if (B) free(B);
if (C) free(C);
if (A_d) cudaFree(A_d);
if (B_d) cudaFree(B_d);
if (C_d) cudaFree(C_d);
return 0;
}
Create Tensor Descriptors
Now we are ready to use the cuTENSOR library and to initialize its library handle. We then create a descriptor for each tensor by providing its number of modes (order), extents, data type, and alignment requirement (the alignment, in bytes, of the data pointers that will be used with this tensor descriptor).
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cutensor.h>
#include <unordered_map>
#include <vector>
// Handle cuTENSOR errors
#define HANDLE_ERROR(x) \
{ const auto err = x; \
if( err != CUTENSOR_STATUS_SUCCESS ) \
{ printf("Error: %s\n", cutensorGetErrorString(err)); exit(-1); } \
};
#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
if( err != cudaSuccess ) \
{ printf("Error: %s\n", cudaGetErrorString(err)); exit(-1); } \
};
int main(int argc, char** argv)
{
// Host element type definition
typedef float floatTypeA;
typedef float floatTypeB;
typedef float floatTypeC;
typedef float floatTypeCompute;
// CUDA types
cutensorDataType_t typeA = CUTENSOR_R_32F;
cutensorDataType_t typeB = CUTENSOR_R_32F;
cutensorDataType_t typeC = CUTENSOR_R_32F;
cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;
printf("Include headers and define data types\n");
/* ***************************** */
// Create vector of modes
std::vector<int> modeC{'m','u','n','v'};
std::vector<int> modeA{'m','h','k','n'};
std::vector<int> modeB{'u','k','v','h'};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();
// Extents
std::unordered_map<int, int64_t> extent;
extent['m'] = 96;
extent['n'] = 96;
extent['u'] = 96;
extent['v'] = 64;
extent['h'] = 64;
extent['k'] = 64;
// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for(auto mode : modeC)
extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for(auto mode : modeA)
extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for(auto mode : modeB)
extentB.push_back(extent[mode]);
printf("Define modes and extents\n");
/* ***************************** */
// Number of elements of each tensor
size_t elementsA = 1;
for(auto mode : modeA)
elementsA *= extent[mode];
size_t elementsB = 1;
for(auto mode : modeB)
elementsB *= extent[mode];
size_t elementsC = 1;
for(auto mode : modeC)
elementsC *= extent[mode];
// Size in bytes
size_t sizeA = sizeof(floatTypeA) * elementsA;
size_t sizeB = sizeof(floatTypeB) * elementsB;
size_t sizeC = sizeof(floatTypeC) * elementsC;
// Allocate on device
void *A_d, *B_d, *C_d;
cudaMalloc((void**)&A_d, sizeA);
cudaMalloc((void**)&B_d, sizeB);
cudaMalloc((void**)&C_d, sizeC);
// Allocate on host
floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);
// Initialize data on host
for(int64_t i = 0; i < elementsA; i++)
A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsB; i++)
B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsC; i++)
C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
// Copy to device
HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
assert(uintptr_t(A_d) % kAlignment == 0);
assert(uintptr_t(B_d) % kAlignment == 0);
assert(uintptr_t(C_d) % kAlignment == 0);
printf("Allocate, initialize and transfer tensors\n");
/*************************
* cuTENSOR
*************************/
cutensorHandle_t handle;
HANDLE_ERROR(cutensorCreate(&handle));
/**********************
* Create Tensor Descriptors
**********************/
cutensorTensorDescriptor_t descA;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descA,
nmodeA,
extentA.data(),
NULL,/*stride*/
typeA, kAlignment));
cutensorTensorDescriptor_t descB;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descB,
nmodeB,
extentB.data(),
NULL,/*stride*/
typeB, kAlignment));
cutensorTensorDescriptor_t descC;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descC,
nmodeC,
extentC.data(),
NULL,/*stride*/
typeC, kAlignment));
printf("Initialize cuTENSOR and tensor descriptors\n");
return 0;
}
Create Contraction Descriptor
In this step we create the operation descriptor that encodes the contraction, and we ensure that the scalar type is correct:
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cutensor.h>
#include <unordered_map>
#include <vector>
// Handle cuTENSOR errors
#define HANDLE_ERROR(x) \
{ const auto err = x; \
if( err != CUTENSOR_STATUS_SUCCESS ) \
{ printf("Error: %s\n", cutensorGetErrorString(err)); exit(-1); } \
};
#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
if( err != cudaSuccess ) \
{ printf("Error: %s\n", cudaGetErrorString(err)); exit(-1); } \
};
int main(int argc, char** argv)
{
// Host element type definition
typedef float floatTypeA;
typedef float floatTypeB;
typedef float floatTypeC;
typedef float floatTypeCompute;
// CUDA types
cutensorDataType_t typeA = CUTENSOR_R_32F;
cutensorDataType_t typeB = CUTENSOR_R_32F;
cutensorDataType_t typeC = CUTENSOR_R_32F;
cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;
printf("Include headers and define data types\n");
/* ***************************** */
// Create vector of modes
std::vector<int> modeC{'m','u','n','v'};
std::vector<int> modeA{'m','h','k','n'};
std::vector<int> modeB{'u','k','v','h'};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();
// Extents
std::unordered_map<int, int64_t> extent;
extent['m'] = 96;
extent['n'] = 96;
extent['u'] = 96;
extent['v'] = 64;
extent['h'] = 64;
extent['k'] = 64;
// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for(auto mode : modeC)
extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for(auto mode : modeA)
extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for(auto mode : modeB)
extentB.push_back(extent[mode]);
printf("Define modes and extents\n");
/* ***************************** */
// Number of elements of each tensor
size_t elementsA = 1;
for(auto mode : modeA)
elementsA *= extent[mode];
size_t elementsB = 1;
for(auto mode : modeB)
elementsB *= extent[mode];
size_t elementsC = 1;
for(auto mode : modeC)
elementsC *= extent[mode];
// Size in bytes
size_t sizeA = sizeof(floatTypeA) * elementsA;
size_t sizeB = sizeof(floatTypeB) * elementsB;
size_t sizeC = sizeof(floatTypeC) * elementsC;
// Allocate on device
void *A_d, *B_d, *C_d;
cudaMalloc((void**)&A_d, sizeA);
cudaMalloc((void**)&B_d, sizeB);
cudaMalloc((void**)&C_d, sizeC);
// Allocate on host
floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);
// Initialize data on host
for(int64_t i = 0; i < elementsA; i++)
A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsB; i++)
B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsC; i++)
C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
// Copy to device
HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
assert(uintptr_t(A_d) % kAlignment == 0);
assert(uintptr_t(B_d) % kAlignment == 0);
assert(uintptr_t(C_d) % kAlignment == 0);
printf("Allocate, initialize and transfer tensors\n");
/*************************
* cuTENSOR
*************************/
cutensorHandle_t handle;
HANDLE_ERROR(cutensorCreate(&handle));
/**********************
* Create Tensor Descriptors
**********************/
cutensorTensorDescriptor_t descA;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descA,
nmodeA,
extentA.data(),
NULL,/*stride*/
typeA, kAlignment));
cutensorTensorDescriptor_t descB;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descB,
nmodeB,
extentB.data(),
NULL,/*stride*/
typeB, kAlignment));
cutensorTensorDescriptor_t descC;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descC,
nmodeC,
extentC.data(),
NULL,/*stride*/
typeC, kAlignment));
printf("Initialize cuTENSOR and tensor descriptors\n");
/*******************************
* Create Contraction Descriptor
*******************************/
cutensorOperationDescriptor_t desc;
HANDLE_ERROR(cutensorCreateContraction(handle,
&desc,
descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(),
descCompute));
/*****************************
* Optional (but recommended): ensure that the scalar type is correct.
*****************************/
cutensorDataType_t scalarType;
HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
desc,
CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
(void*)&scalarType,
sizeof(scalarType)));
assert(scalarType == CUTENSOR_R_32F);
typedef float floatTypeCompute;
floatTypeCompute alpha = (floatTypeCompute)1.1f;
floatTypeCompute beta = (floatTypeCompute)0.f;
return 0;
}
Determine Algorithm and Workspace
Now that we have defined both the tensors and the contraction that we want to perform, we must select an algorithm to perform the contraction. That algorithm is specified via cutensorAlgo_t. Specifying CUTENSOR_ALGO_DEFAULT allows us to let cuTENSOR's internal heuristic choose the best approach. All the information needed to find a good algorithm is stored in the cutensorPlanPreference_t data structure. We can also query the library to estimate the amount of workspace required for the given operation descriptor; the user can choose between different cutensorWorksizePreference_t settings. For this example we use CUTENSOR_WORKSPACE_DEFAULT, a good default that aims for high performance while reducing the workspace requirement. Although workspace memory is not strictly required, it is strongly recommended for achieving high performance.
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cutensor.h>
#include <unordered_map>
#include <vector>
// Handle cuTENSOR errors
#define HANDLE_ERROR(x) \
{ const auto err = x; \
if( err != CUTENSOR_STATUS_SUCCESS ) \
{ printf("Error: %s\n", cutensorGetErrorString(err)); exit(-1); } \
};
#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
if( err != cudaSuccess ) \
{ printf("Error: %s\n", cudaGetErrorString(err)); exit(-1); } \
};
int main(int argc, char** argv)
{
// Host element type definition
typedef float floatTypeA;
typedef float floatTypeB;
typedef float floatTypeC;
typedef float floatTypeCompute;
// CUDA types
cutensorDataType_t typeA = CUTENSOR_R_32F;
cutensorDataType_t typeB = CUTENSOR_R_32F;
cutensorDataType_t typeC = CUTENSOR_R_32F;
cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;
printf("Include headers and define data types\n");
/* ***************************** */
// Create vector of modes
std::vector<int> modeC{'m','u','n','v'};
std::vector<int> modeA{'m','h','k','n'};
std::vector<int> modeB{'u','k','v','h'};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();
// Extents
std::unordered_map<int, int64_t> extent;
extent['m'] = 96;
extent['n'] = 96;
extent['u'] = 96;
extent['v'] = 64;
extent['h'] = 64;
extent['k'] = 64;
// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for(auto mode : modeC)
extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for(auto mode : modeA)
extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for(auto mode : modeB)
extentB.push_back(extent[mode]);
printf("Define modes and extents\n");
/* ***************************** */
// Number of elements of each tensor
size_t elementsA = 1;
for(auto mode : modeA)
elementsA *= extent[mode];
size_t elementsB = 1;
for(auto mode : modeB)
elementsB *= extent[mode];
size_t elementsC = 1;
for(auto mode : modeC)
elementsC *= extent[mode];
// Size in bytes
size_t sizeA = sizeof(floatTypeA) * elementsA;
size_t sizeB = sizeof(floatTypeB) * elementsB;
size_t sizeC = sizeof(floatTypeC) * elementsC;
// Allocate on device
void *A_d, *B_d, *C_d;
cudaMalloc((void**)&A_d, sizeA);
cudaMalloc((void**)&B_d, sizeB);
cudaMalloc((void**)&C_d, sizeC);
// Allocate on host
floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);
// Initialize data on host
for(int64_t i = 0; i < elementsA; i++)
A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsB; i++)
B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsC; i++)
C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
// Copy to device
HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
assert(uintptr_t(A_d) % kAlignment == 0);
assert(uintptr_t(B_d) % kAlignment == 0);
assert(uintptr_t(C_d) % kAlignment == 0);
printf("Allocate, initialize and transfer tensors\n");
/*************************
* cuTENSOR
*************************/
cutensorHandle_t handle;
HANDLE_ERROR(cutensorCreate(&handle));
/**********************
* Create Tensor Descriptors
**********************/
cutensorTensorDescriptor_t descA;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descA,
nmodeA,
extentA.data(),
NULL,/*stride*/
typeA, kAlignment));
cutensorTensorDescriptor_t descB;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descB,
nmodeB,
extentB.data(),
NULL,/*stride*/
typeB, kAlignment));
cutensorTensorDescriptor_t descC;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descC,
nmodeC,
extentC.data(),
NULL,/*stride*/
typeC, kAlignment));
printf("Initialize cuTENSOR and tensor descriptors\n");
/*******************************
* Create Contraction Descriptor
*******************************/
cutensorOperationDescriptor_t desc;
HANDLE_ERROR(cutensorCreateContraction(handle,
&desc,
descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(),
descCompute));
/*****************************
* Optional (but recommended): ensure that the scalar type is correct.
*****************************/
cutensorDataType_t scalarType;
HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
desc,
CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
(void*)&scalarType,
sizeof(scalarType)));
assert(scalarType == CUTENSOR_R_32F);
typedef float floatTypeCompute;
floatTypeCompute alpha = (floatTypeCompute)1.1f;
floatTypeCompute beta = (floatTypeCompute)0.f;
/**************************
* Set the algorithm to use
***************************/
const cutensorAlgo_t algo = CUTENSOR_ALGO_DEFAULT;
cutensorPlanPreference_t planPref;
HANDLE_ERROR(cutensorCreatePlanPreference(
handle,
&planPref,
algo,
CUTENSOR_JIT_MODE_NONE));
/**********************
* Query workspace estimate
**********************/
uint64_t workspaceSizeEstimate = 0;
const cutensorWorksizePreference_t workspacePref = CUTENSOR_WORKSPACE_DEFAULT;
HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
desc,
planPref,
workspacePref,
&workspaceSizeEstimate));
return 0;
}
Note that this example does not use just-in-time compilation (CUTENSOR_JIT_MODE_NONE); if you want to learn more about cuTENSOR's JIT capabilities, please refer to Just-In-Time (JIT) Compilation.
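As a minimal sketch of what opting into JIT would look like (assuming the CUTENSOR_JIT_MODE_DEFAULT value described in that section; everything else in the example stays the same), only the plan-preference creation changes:

// Sketch only (assumes CUTENSOR_JIT_MODE_DEFAULT as documented in the JIT section):
// request a JIT-compiled kernel for this contraction; the subsequent
// cutensorCreatePlan() call then pays a one-time compilation cost.
cutensorPlanPreference_t planPrefJit;
HANDLE_ERROR(cutensorCreatePlanPreference(
             handle,
             &planPrefJit,
             CUTENSOR_ALGO_DEFAULT,
             CUTENSOR_JIT_MODE_DEFAULT));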
Plan and Reduce Workspace
With the workspace size estimate in hand, we can now create the actual plan; this step relies on cuTENSOR's heuristics to select the most suitable algorithm and kernel:
cutensorPlan_t plan;
HANDLE_ERROR(cutensorCreatePlan(handle,
&plan,
desc,
planPref,
workspaceSizeEstimate));
Once the plan has been created and a kernel has been selected, we can (optionally) query the amount of workspace that the plan actually requires:
uint64_t actualWorkspaceSize = 0;
HANDLE_ERROR(cutensorPlanGetAttribute(handle,
plan,
CUTENSOR_PLAN_REQUIRED_WORKSPACE,
&actualWorkspaceSize,
sizeof(actualWorkspaceSize)));
With that, our code becomes:
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cutensor.h>
#include <unordered_map>
#include <vector>
// Handle cuTENSOR errors
#define HANDLE_ERROR(x) \
{ const auto err = x; \
if( err != CUTENSOR_STATUS_SUCCESS ) \
{ printf("Error: %s\n", cutensorGetErrorString(err)); exit(-1); } \
};
#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
if( err != cudaSuccess ) \
{ printf("Error: %s\n", cudaGetErrorString(err)); exit(-1); } \
};
int main(int argc, char** argv)
{
// Host element type definition
typedef float floatTypeA;
typedef float floatTypeB;
typedef float floatTypeC;
typedef float floatTypeCompute;
// CUDA types
cutensorDataType_t typeA = CUTENSOR_R_32F;
cutensorDataType_t typeB = CUTENSOR_R_32F;
cutensorDataType_t typeC = CUTENSOR_R_32F;
cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;
printf("Include headers and define data types\n");
/* ***************************** */
// Create vector of modes
std::vector<int> modeC{'m','u','n','v'};
std::vector<int> modeA{'m','h','k','n'};
std::vector<int> modeB{'u','k','v','h'};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();
// Extents
std::unordered_map<int, int64_t> extent;
extent['m'] = 96;
extent['n'] = 96;
extent['u'] = 96;
extent['v'] = 64;
extent['h'] = 64;
extent['k'] = 64;
// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for(auto mode : modeC)
extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for(auto mode : modeA)
extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for(auto mode : modeB)
extentB.push_back(extent[mode]);
printf("Define modes and extents\n");
/* ***************************** */
// Number of elements of each tensor
size_t elementsA = 1;
for(auto mode : modeA)
elementsA *= extent[mode];
size_t elementsB = 1;
for(auto mode : modeB)
elementsB *= extent[mode];
size_t elementsC = 1;
for(auto mode : modeC)
elementsC *= extent[mode];
// Size in bytes
size_t sizeA = sizeof(floatTypeA) * elementsA;
size_t sizeB = sizeof(floatTypeB) * elementsB;
size_t sizeC = sizeof(floatTypeC) * elementsC;
// Allocate on device
void *A_d, *B_d, *C_d;
cudaMalloc((void**)&A_d, sizeA);
cudaMalloc((void**)&B_d, sizeB);
cudaMalloc((void**)&C_d, sizeC);
// Allocate on host
floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);
// Initialize data on host
for(int64_t i = 0; i < elementsA; i++)
A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsB; i++)
B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsC; i++)
C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
// Copy to device
HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
assert(uintptr_t(A_d) % kAlignment == 0);
assert(uintptr_t(B_d) % kAlignment == 0);
assert(uintptr_t(C_d) % kAlignment == 0);
printf("Allocate, initialize and transfer tensors\n");
/*************************
* cuTENSOR
*************************/
cutensorHandle_t handle;
HANDLE_ERROR(cutensorCreate(&handle));
/**********************
* Create Tensor Descriptors
**********************/
cutensorTensorDescriptor_t descA;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descA,
nmodeA,
extentA.data(),
NULL,/*stride*/
typeA, kAlignment));
cutensorTensorDescriptor_t descB;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descB,
nmodeB,
extentB.data(),
NULL,/*stride*/
typeB, kAlignment));
cutensorTensorDescriptor_t descC;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descC,
nmodeC,
extentC.data(),
NULL,/*stride*/
typeC, kAlignment));
printf("Initialize cuTENSOR and tensor descriptors\n");
/*******************************
* Create Contraction Descriptor
*******************************/
cutensorOperationDescriptor_t desc;
HANDLE_ERROR(cutensorCreateContraction(handle,
&desc,
descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(),
descCompute));
/*****************************
* Optional (but recommended): ensure that the scalar type is correct.
*****************************/
cutensorDataType_t scalarType;
HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
desc,
CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
(void*)&scalarType,
sizeof(scalarType)));
assert(scalarType == CUTENSOR_R_32F);
typedef float floatTypeCompute;
floatTypeCompute alpha = (floatTypeCompute)1.1f;
floatTypeCompute beta = (floatTypeCompute)0.f;
/**************************
* Set the algorithm to use
***************************/
const cutensorAlgo_t algo = CUTENSOR_ALGO_DEFAULT;
cutensorPlanPreference_t planPref;
HANDLE_ERROR(cutensorCreatePlanPreference(
handle,
&planPref,
algo,
CUTENSOR_JIT_MODE_NONE));
/**********************
* Query workspace estimate
**********************/
uint64_t workspaceSizeEstimate = 0;
const cutensorWorksizePreference_t workspacePref = CUTENSOR_WORKSPACE_DEFAULT;
HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
desc,
planPref,
workspacePref,
&workspaceSizeEstimate));
/**************************
* Create Contraction Plan
**************************/
cutensorPlan_t plan;
HANDLE_ERROR(cutensorCreatePlan(handle,
&plan,
desc,
planPref,
workspaceSizeEstimate));
/**************************
* Optional: Query information about the created plan
**************************/
// query actually used workspace
uint64_t actualWorkspaceSize = 0;
HANDLE_ERROR(cutensorPlanGetAttribute(handle,
plan,
CUTENSOR_PLAN_REQUIRED_WORKSPACE,
&actualWorkspaceSize,
sizeof(actualWorkspaceSize)));
// At this point the user knows exactly how much memory is need by the operation and
// only the smaller actual workspace needs to be allocated
assert(actualWorkspaceSize <= workspaceSizeEstimate);
void *work = nullptr;
if (actualWorkspaceSize > 0)
{
HANDLE_CUDA_ERROR(cudaMalloc(&work, actualWorkspaceSize));
assert(uintptr_t(work) % 128 == 0); // workspace must be aligned to 128 byte-boundary
}
return 0;
}
Execute
Finally, we are ready to execute the tensor contraction and to destroy (free) all the allocated resources:
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cutensor.h>
#include <unordered_map>
#include <vector>
// Handle cuTENSOR errors
#define HANDLE_ERROR(x) \
{ const auto err = x; \
if( err != CUTENSOR_STATUS_SUCCESS ) \
{ printf("Error: %s\n", cutensorGetErrorString(err)); exit(-1); } \
};
#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
if( err != cudaSuccess ) \
{ printf("Error: %s\n", cudaGetErrorString(err)); exit(-1); } \
};
int main(int argc, char** argv)
{
// Host element type definition
typedef float floatTypeA;
typedef float floatTypeB;
typedef float floatTypeC;
typedef float floatTypeCompute;
// CUDA types
cutensorDataType_t typeA = CUTENSOR_R_32F;
cutensorDataType_t typeB = CUTENSOR_R_32F;
cutensorDataType_t typeC = CUTENSOR_R_32F;
cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;
printf("Include headers and define data types\n");
/* ***************************** */
// Create vector of modes
std::vector<int> modeC{'m','u','n','v'};
std::vector<int> modeA{'m','h','k','n'};
std::vector<int> modeB{'u','k','v','h'};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();
// Extents
std::unordered_map<int, int64_t> extent;
extent['m'] = 96;
extent['n'] = 96;
extent['u'] = 96;
extent['v'] = 64;
extent['h'] = 64;
extent['k'] = 64;
// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for(auto mode : modeC)
extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for(auto mode : modeA)
extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for(auto mode : modeB)
extentB.push_back(extent[mode]);
printf("Define modes and extents\n");
/* ***************************** */
// Number of elements of each tensor
size_t elementsA = 1;
for(auto mode : modeA)
elementsA *= extent[mode];
size_t elementsB = 1;
for(auto mode : modeB)
elementsB *= extent[mode];
size_t elementsC = 1;
for(auto mode : modeC)
elementsC *= extent[mode];
// Size in bytes
size_t sizeA = sizeof(floatTypeA) * elementsA;
size_t sizeB = sizeof(floatTypeB) * elementsB;
size_t sizeC = sizeof(floatTypeC) * elementsC;
// Allocate on device
void *A_d, *B_d, *C_d;
cudaMalloc((void**)&A_d, sizeA);
cudaMalloc((void**)&B_d, sizeB);
cudaMalloc((void**)&C_d, sizeC);
// Allocate on host
floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);
// Initialize data on host
for(int64_t i = 0; i < elementsA; i++)
A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsB; i++)
B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for(int64_t i = 0; i < elementsC; i++)
C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
// Copy to device
HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
assert(uintptr_t(A_d) % kAlignment == 0);
assert(uintptr_t(B_d) % kAlignment == 0);
assert(uintptr_t(C_d) % kAlignment == 0);
printf("Allocate, initialize and transfer tensors\n");
/*************************
* cuTENSOR
*************************/
cutensorHandle_t handle;
HANDLE_ERROR(cutensorCreate(&handle));
/**********************
* Create Tensor Descriptors
**********************/
cutensorTensorDescriptor_t descA;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descA,
nmodeA,
extentA.data(),
NULL,/*stride*/
typeA, kAlignment));
cutensorTensorDescriptor_t descB;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descB,
nmodeB,
extentB.data(),
NULL,/*stride*/
typeB, kAlignment));
cutensorTensorDescriptor_t descC;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descC,
nmodeC,
extentC.data(),
NULL,/*stride*/
typeC, kAlignment));
printf("Initialize cuTENSOR and tensor descriptors\n");
/*******************************
* Create Contraction Descriptor
*******************************/
cutensorOperationDescriptor_t desc;
HANDLE_ERROR(cutensorCreateContraction(handle,
&desc,
descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(),
descCompute));
/*****************************
* Optional (but recommended): ensure that the scalar type is correct.
*****************************/
cutensorDataType_t scalarType;
HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
desc,
CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
(void*)&scalarType,
sizeof(scalarType)));
assert(scalarType == CUTENSOR_R_32F);
typedef float floatTypeCompute;
floatTypeCompute alpha = (floatTypeCompute)1.1f;
floatTypeCompute beta = (floatTypeCompute)0.f;
/**************************
* Set the algorithm to use
***************************/
const cutensorAlgo_t algo = CUTENSOR_ALGO_DEFAULT;
cutensorPlanPreference_t planPref;
HANDLE_ERROR(cutensorCreatePlanPreference(
handle,
&planPref,
algo,
CUTENSOR_JIT_MODE_NONE));
/**********************
* Query workspace estimate
**********************/
uint64_t workspaceSizeEstimate = 0;
const cutensorWorksizePreference_t workspacePref = CUTENSOR_WORKSPACE_DEFAULT;
HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
desc,
planPref,
workspacePref,
&workspaceSizeEstimate));
/**************************
* Create Contraction Plan
**************************/
cutensorPlan_t plan;
HANDLE_ERROR(cutensorCreatePlan(handle,
&plan,
desc,
planPref,
workspaceSizeEstimate));
/**************************
* Optional: Query information about the created plan
**************************/
// query actually used workspace
uint64_t actualWorkspaceSize = 0;
HANDLE_ERROR(cutensorPlanGetAttribute(handle,
plan,
CUTENSOR_PLAN_REQUIRED_WORKSPACE,
&actualWorkspaceSize,
sizeof(actualWorkspaceSize)));
// At this point the user knows exactly how much memory is need by the operation and
// only the smaller actual workspace needs to be allocated
assert(actualWorkspaceSize <= workspaceSizeEstimate);
void *work = nullptr;
if (actualWorkspaceSize > 0)
{
HANDLE_CUDA_ERROR(cudaMalloc(&work, actualWorkspaceSize));
assert(uintptr_t(work) % 128 == 0); // workspace must be aligned to 128 byte-boundary
}
/**********************
* Execute
**********************/
cudaStream_t stream;
HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));
HANDLE_ERROR(cutensorContract(handle,
plan,
(void*) &alpha, A_d, B_d,
(void*) &beta, C_d, C_d,
work, actualWorkspaceSize, stream));
/**********************
* Free allocated data
**********************/
HANDLE_ERROR(cutensorDestroy(handle));
HANDLE_ERROR(cutensorDestroyPlan(plan));
HANDLE_ERROR(cutensorDestroyOperationDescriptor(desc));
HANDLE_ERROR(cutensorDestroyTensorDescriptor(descA));
HANDLE_ERROR(cutensorDestroyTensorDescriptor(descB));
HANDLE_ERROR(cutensorDestroyTensorDescriptor(descC));
HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));
if (A) free(A);
if (B) free(B);
if (C) free(C);
if (A_d) cudaFree(A_d);
if (B_d) cudaFree(B_d);
if (C_d) cudaFree(C_d);
if (work) cudaFree(work);
return 0;
}
And that is it: we have executed our first contraction via cuTENSOR! You can find this and other examples in the samples repository.
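For reference, compiling and running the final listing with the commands from the installation step should print the progress messages emitted by the printf calls above:

nvcc contraction.cu -L${CUTENSOR_ROOT}/lib/12/ -I${CUTENSOR_ROOT}/include -std=c++11 -lcutensor -o contraction
./contraction
Include headers and define data types
Define modes and extents
Allocate, initialize and transfer tensors
Initialize cuTENSOR and tensor descriptors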