入门指南#
在本节中,我们将展示如何使用 nvplTENSOR 实现第一个张量缩并。我们的代码将使用单精度算术计算以下操作。
我们逐步构建代码,每个步骤都在末尾添加代码。步骤之间用多个星号组成的注释分隔。
标题和数据类型#
首先,我们从一个简单的 main()
函数开始,包含一些头文件,并定义一些数据类型。
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <nvpl_tensor.h>

/*
 * Step 1: include the required headers and define the element types of the
 * tensors as well as the compute type (all single precision here).
 */
int main(int argc, char** argv)
{
    // Element types of the tensors on the host
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // Matching nvplTENSOR data-type / compute-descriptor constants
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    return 0;
}
定义张量大小#
接下来,我们定义张量的模式和范围。为了示例的目的,我们假设模式 \(m\)、\(n\) 和 \(u\) 的范围为 6;\(v\)、\(h\) 和 \(k\) 的范围为 4(与下面代码中的 extent 数组一致)。请注意,这里的模式是用整数标记的;由于字符在 C 语言中本质上也是整数,这同样允许我们改用字符(例如 'm'、'n')来标记模式。有关术语"模式"和"范围"的解释,请参见 命名法。
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <nvpl_tensor.h>

/*
 * Step 2: define the modes of each tensor and gather the per-tensor extents
 * from the global mode->extent table.
 */
int main(int argc, char** argv)
{
    // Element types of the tensors on the host
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // Matching nvplTENSOR data-type / compute-descriptor constants
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Each mode is labeled by an integer 0..5; extent[mode] is its range.
    int32_t modeC[] = {0,2,1,3};
    int32_t modeA[] = {0,4,5,1};
    int32_t modeB[] = {2,5,3,4};
    const int nmodeA = 4;
    const int nmodeB = 4;
    const int nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, looked up through each tensor's mode list
    int64_t extentC[nmodeC];
    for (int j = 0; j < nmodeC; ++j)
    {
        extentC[j] = extent[modeC[j]];
    }

    int64_t extentA[nmodeA];
    for (int j = 0; j < nmodeA; ++j)
    {
        extentA[j] = extent[modeA[j]];
    }

    int64_t extentB[nmodeB];
    for (int j = 0; j < nmodeB; ++j)
    {
        extentB[j] = extent[modeB[j]];
    }

    printf("Define modes and extents\n");

    return 0;
}
初始化张量数据#
接下来,我们需要为我们的张量分配并初始化主机内存(nvplTENSOR 在 CPU 上运行,不涉及设备内存)
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <nvpl_tensor.h>

// Round x up to the next multiple of align.
// C11 aligned_alloc() requires the requested size to be a multiple of the alignment.
static inline int64_t round_up(int64_t x, int64_t align)
{
    return (x + align - 1) / align * align;
}

/*
 * Step 3: allocate and initialize host memory for the tensors.
 */
int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types matching the typedefs above
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Each mode is labeled by an integer 0..5; extent[mode] is its range.
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, gathered from the global mode->extent table
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of its OWN extents.
    // (Fix: the original multiplied extent[i] -- the first nmode entries of the
    // global table -- instead of the tensor's extents extentA/extentB/extentC.)
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128; // Alignment of the pointers (bytes)

    // Allocate aligned host memory; sizes are rounded up as required by C11.
    floatTypeA* A = aligned_alloc(kAlignment, round_up(sizeA, kAlignment));
    floatTypeB* B = aligned_alloc(kAlignment, round_up(sizeB, kAlignment));
    floatTypeC* C = aligned_alloc(kAlignment, round_up(sizeC, kAlignment));
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        free(A); // free(NULL) is a no-op
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t) A % kAlignment == 0);
    assert((uintptr_t) B % kAlignment == 0);
    assert((uintptr_t) C % kAlignment == 0);

    printf("Allocate and initialize tensors\n");

    /**********************
     * Free allocated data
     **********************/

    free(A);
    free(B);
    free(C);

    return 0;
}
创建张量描述符#
现在我们准备好使用 nvplTENSOR 库并初始化其库句柄。然后,我们通过提供每个张量的数据类型、模式数(阶数)、范围、步长,以及数据指针的对齐方式(以字节为单位)来创建张量描述符。
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <nvpl_tensor.h>

// Handle nvplTENSOR errors: print the error string and abort.
#define HANDLE_ERROR(x)                                           \
    do {                                                          \
        const nvpltensorStatus_t err = (x);                       \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

// Round x up to the next multiple of align.
// C11 aligned_alloc() requires the requested size to be a multiple of the alignment.
static inline int64_t round_up(int64_t x, int64_t align)
{
    return (x + align - 1) / align * align;
}

/*
 * Step 4: create the library handle and one descriptor per tensor.
 */
int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types matching the typedefs above
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Each mode is labeled by an integer 0..5; extent[mode] is its range.
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, gathered from the global mode->extent table
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of its OWN extents.
    // (Fix: the original multiplied extent[i] instead of extentA/extentB/extentC.)
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128; // Alignment of the pointers (bytes)

    // Allocate aligned host memory; sizes are rounded up as required by C11.
    floatTypeA* A = aligned_alloc(kAlignment, round_up(sizeA, kAlignment));
    floatTypeB* B = aligned_alloc(kAlignment, round_up(sizeB, kAlignment));
    floatTypeC* C = aligned_alloc(kAlignment, round_up(sizeC, kAlignment));
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        free(A); // free(NULL) is a no-op
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t) A % kAlignment == 0);
    assert((uintptr_t) B % kAlignment == 0);
    assert((uintptr_t) C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set number of threads, that nvplTensor can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /**********************
     * Free allocated data (reverse order of creation)
     **********************/

    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));
    HANDLE_ERROR(nvpltensorDestroy(handle));

    free(A);
    free(B);
    free(C);

    return 0;
}
创建缩并描述符#
在此步骤中,我们创建编码缩并的操作描述符,并确保标量类型正确
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <nvpl_tensor.h>

// Handle nvplTENSOR errors: print the error string and abort.
#define HANDLE_ERROR(x)                                           \
    do {                                                          \
        const nvpltensorStatus_t err = (x);                       \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

// Round x up to the next multiple of align.
// C11 aligned_alloc() requires the requested size to be a multiple of the alignment.
static inline int64_t round_up(int64_t x, int64_t align)
{
    return (x + align - 1) / align * align;
}

/*
 * Step 5: create the contraction's operation descriptor and verify the
 * scalar type it expects for alpha/beta.
 */
int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types matching the typedefs above
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Each mode is labeled by an integer 0..5; extent[mode] is its range.
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, gathered from the global mode->extent table
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of its OWN extents.
    // (Fix: the original multiplied extent[i] instead of extentA/extentB/extentC.)
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128; // Alignment of the pointers (bytes)

    // Allocate aligned host memory; sizes are rounded up as required by C11.
    floatTypeA* A = aligned_alloc(kAlignment, round_up(sizeA, kAlignment));
    floatTypeB* B = aligned_alloc(kAlignment, round_up(sizeB, kAlignment));
    floatTypeC* C = aligned_alloc(kAlignment, round_up(sizeC, kAlignment));
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        free(A); // free(NULL) is a no-op
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t) A % kAlignment == 0);
    assert((uintptr_t) B % kAlignment == 0);
    assert((uintptr_t) C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set number of threads, that nvplTensor can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));
    assert(scalarType == NVPLTENSOR_R_32F);

    // floatTypeCompute was already typedef'd above; the duplicate typedef
    // the original carried here was redundant and has been removed.
    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta  = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**********************
     * Free allocated data (reverse order of creation)
     **********************/

    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));
    HANDLE_ERROR(nvpltensorDestroy(handle));

    free(A);
    free(B);
    free(C);

    return 0;
}
确定算法和工作区#
现在我们已经定义了张量和我们想要执行的缩并,我们必须选择一种算法来执行缩并。该算法由 nvpltensorAlgo_t 指定。指定 NVPLTENSOR_ALGO_DEFAULT
允许我们让 nvplTENSOR 的内部启发式方法选择最佳方法。查找良好算法的所有信息都存储在 nvpltensorPlanPreference_t 数据结构中。我们还可以查询库以估计所提供的操作描述符的工作区需求量;用户可以在不同的 nvpltensorWorksizePreference_t 之间进行选择。对于此示例,我们使用 NVPLTENSOR_WORKSPACE_DEFAULT
,这是一个很好的默认选择,旨在实现高性能,同时减少工作区需求。虽然工作区内存不是严格必需的,但强烈建议使用它以获得高性能。
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <nvpl_tensor.h>

// Handle nvplTENSOR errors: print the error string and abort.
#define HANDLE_ERROR(x)                                           \
    do {                                                          \
        const nvpltensorStatus_t err = (x);                       \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

// Round x up to the next multiple of align.
// C11 aligned_alloc() requires the requested size to be a multiple of the alignment.
static inline int64_t round_up(int64_t x, int64_t align)
{
    return (x + align - 1) / align * align;
}

/*
 * Step 6: choose the algorithm (via a plan preference) and query an
 * estimate of the workspace the contraction may need.
 */
int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types matching the typedefs above
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Each mode is labeled by an integer 0..5; extent[mode] is its range.
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, gathered from the global mode->extent table
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of its OWN extents.
    // (Fix: the original multiplied extent[i] instead of extentA/extentB/extentC.)
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128; // Alignment of the pointers (bytes)

    // Allocate aligned host memory; sizes are rounded up as required by C11.
    floatTypeA* A = aligned_alloc(kAlignment, round_up(sizeA, kAlignment));
    floatTypeB* B = aligned_alloc(kAlignment, round_up(sizeB, kAlignment));
    floatTypeC* C = aligned_alloc(kAlignment, round_up(sizeC, kAlignment));
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        free(A); // free(NULL) is a no-op
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t) A % kAlignment == 0);
    assert((uintptr_t) B % kAlignment == 0);
    assert((uintptr_t) C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set number of threads, that nvplTensor can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));
    assert(scalarType == NVPLTENSOR_R_32F);

    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta  = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**************************
     * Set the algorithm to use
     ***************************/

    nvpltensorAlgo_t const algo = NVPLTENSOR_ALGO_DEFAULT;

    nvpltensorPlanPreference_t planPref;
    HANDLE_ERROR(nvpltensorCreatePlanPreference(handle, &planPref,
                 algo, NVPLTENSOR_JIT_MODE_NONE));

    printf("Initialize plan preference\n");

    /**********************
     * Query workspace estimate
     **********************/

    uint64_t workspaceSizeEstimate = 0;
    nvpltensorWorksizePreference_t const workspacePref = NVPLTENSOR_WORKSPACE_DEFAULT;
    HANDLE_ERROR(nvpltensorEstimateWorkspaceSize(handle, desc,
                 planPref, workspacePref, &workspaceSizeEstimate));

    printf("Estimate workspace required for operation\n");

    /**********************
     * Free allocated data (reverse order of creation)
     **********************/

    HANDLE_ERROR(nvpltensorDestroyPlanPreference(planPref));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));
    HANDLE_ERROR(nvpltensorDestroy(handle));

    free(A);
    free(B);
    free(C);

    return 0;
}
请注意,在此示例中我们未使用即时编译 (NVPLTENSOR_JIT_MODE_NONE
),如果您想了解有关 nvplTENSOR 的 JIT 功能的更多信息,请参阅 jit-compilation-label。
规划和减少工作区#
有了工作区大小的估计,我们现在可以继续创建实际的计划;此步骤依赖于 nvplTENSOR 的启发式方法来选择最合适的算法和内核
nvpltensorPlan_t plan;
HANDLE_ERROR(nvpltensorCreatePlan(handle,
&plan,
desc,
planPref,
workspaceSizeEstimate));
一旦创建了计划并选择了内核,我们可以(可选地)查询计划实际需要的工作区
uint64_t actualWorkspaceSize = 0;
HANDLE_ERROR(nvpltensorPlanGetAttribute(handle,
plan,
NVPLTENSOR_PLAN_REQUIRED_WORKSPACE,
&actualWorkspaceSize,
sizeof(actualWorkspaceSize)));
这样我们的代码就变成了
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <nvpl_tensor.h>

// Handle nvplTENSOR errors: print the error string and abort.
#define HANDLE_ERROR(x)                                           \
    do {                                                          \
        const nvpltensorStatus_t err = (x);                       \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

// Round x up to the next multiple of align.
// C11 aligned_alloc() requires the requested size to be a multiple of the alignment.
static inline int64_t round_up(int64_t x, int64_t align)
{
    return (x + align - 1) / align * align;
}

/*
 * Step 7: create the contraction plan, query the workspace it actually
 * requires, and allocate that (smaller) workspace.
 */
int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types matching the typedefs above
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Each mode is labeled by an integer 0..5; extent[mode] is its range.
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, gathered from the global mode->extent table
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of its OWN extents.
    // (Fix: the original multiplied extent[i] instead of extentA/extentB/extentC.)
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128; // Alignment of the pointers (bytes)

    // Allocate aligned host memory; sizes are rounded up as required by C11.
    floatTypeA* A = aligned_alloc(kAlignment, round_up(sizeA, kAlignment));
    floatTypeB* B = aligned_alloc(kAlignment, round_up(sizeB, kAlignment));
    floatTypeC* C = aligned_alloc(kAlignment, round_up(sizeC, kAlignment));
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        free(A); // free(NULL) is a no-op
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t) A % kAlignment == 0);
    assert((uintptr_t) B % kAlignment == 0);
    assert((uintptr_t) C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set number of threads, that nvplTensor can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));
    assert(scalarType == NVPLTENSOR_R_32F);

    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta  = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**************************
     * Set the algorithm to use
     ***************************/

    nvpltensorAlgo_t const algo = NVPLTENSOR_ALGO_DEFAULT;

    nvpltensorPlanPreference_t planPref;
    HANDLE_ERROR(nvpltensorCreatePlanPreference(handle, &planPref,
                 algo, NVPLTENSOR_JIT_MODE_NONE));

    printf("Initialize plan preference\n");

    /**********************
     * Query workspace estimate
     **********************/

    uint64_t workspaceSizeEstimate = 0;
    nvpltensorWorksizePreference_t const workspacePref = NVPLTENSOR_WORKSPACE_DEFAULT;
    HANDLE_ERROR(nvpltensorEstimateWorkspaceSize(handle, desc,
                 planPref, workspacePref, &workspaceSizeEstimate));

    printf("Estimate workspace required for operation\n");

    /**************************
     * Create Contraction Plan
     **************************/

    nvpltensorPlan_t plan;
    HANDLE_ERROR(nvpltensorCreatePlan(handle, &plan,
                 desc, planPref, workspaceSizeEstimate));

    printf("Initialize plan\n");

    /**************************
     * Optional: Query information about the created plan
     **************************/

    // query actually used workspace
    uint64_t actualWorkspaceSize = 0;
    HANDLE_ERROR(nvpltensorPlanGetAttribute(handle, plan,
                 NVPLTENSOR_PLAN_REQUIRED_WORKSPACE,
                 &actualWorkspaceSize, sizeof(actualWorkspaceSize)));

    // At this point the user knows exactly how much memory is needed by the
    // operation; only the (typically smaller) actual workspace must be allocated.
    assert(actualWorkspaceSize <= workspaceSizeEstimate);
    actualWorkspaceSize += 256; // small safety margin

    printf("Query information about the created plan\n");

    void* work = NULL;
    if (actualWorkspaceSize > 0)
    {
        // Round the size up: aligned_alloc requires size % alignment == 0.
        work = aligned_alloc(kAlignment,
                             round_up((int64_t) actualWorkspaceSize, kAlignment));
    }

    /**********************
     * Free allocated data (reverse order of creation)
     **********************/

    HANDLE_ERROR(nvpltensorDestroyPlan(plan));
    HANDLE_ERROR(nvpltensorDestroyPlanPreference(planPref));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));
    HANDLE_ERROR(nvpltensorDestroy(handle));

    free(A);
    free(B);
    free(C);
    free(work); // free(NULL) is a no-op

    return 0;
}
执行#
最后,我们准备好执行张量缩并并销毁(释放)所有已分配的资源
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <nvpl_tensor.h>

/*
 * Final example: perform the single-precision tensor contraction
 *   C[m,u,n,v] = alpha * A[m,h,k,n] * B[u,k,v,h] + beta * C[m,u,n,v]
 * with nvplTENSOR, then release every allocated resource.
 */

// Handle nvplTENSOR errors: print the error string and abort.
#define HANDLE_ERROR(x)                                           \
    do {                                                          \
        const nvpltensorStatus_t err = (x);                       \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

// Round x up to the next multiple of align.
// C11 aligned_alloc() requires the requested size to be a multiple of the alignment.
static inline int64_t round_up(int64_t x, int64_t align)
{
    return (x + align - 1) / align * align;
}

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types matching the typedefs above
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Each mode is labeled by an integer 0..5; extent[mode] is its range.
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, gathered from the global mode->extent table
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of its OWN extents.
    // (Fix: the original multiplied extent[i] -- the first nmode entries of the
    // global table -- instead of the tensor's extents extentA/extentB/extentC.)
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128; // Alignment of the pointers (bytes)

    // Allocate aligned host memory; sizes are rounded up as required by C11.
    floatTypeA* A = aligned_alloc(kAlignment, round_up(sizeA, kAlignment));
    floatTypeB* B = aligned_alloc(kAlignment, round_up(sizeB, kAlignment));
    floatTypeC* C = aligned_alloc(kAlignment, round_up(sizeC, kAlignment));
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        free(A); // free(NULL) is a no-op
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t) A % kAlignment == 0);
    assert((uintptr_t) B % kAlignment == 0);
    assert((uintptr_t) C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set number of threads, that nvplTensor can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));
    assert(scalarType == NVPLTENSOR_R_32F);

    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta  = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**************************
     * Set the algorithm to use
     ***************************/

    nvpltensorAlgo_t const algo = NVPLTENSOR_ALGO_DEFAULT;

    nvpltensorPlanPreference_t planPref;
    HANDLE_ERROR(nvpltensorCreatePlanPreference(handle, &planPref,
                 algo, NVPLTENSOR_JIT_MODE_NONE));

    printf("Initialize plan preference\n");

    /**********************
     * Query workspace estimate
     **********************/

    uint64_t workspaceSizeEstimate = 0;
    nvpltensorWorksizePreference_t const workspacePref = NVPLTENSOR_WORKSPACE_DEFAULT;
    HANDLE_ERROR(nvpltensorEstimateWorkspaceSize(handle, desc,
                 planPref, workspacePref, &workspaceSizeEstimate));

    printf("Estimate workspace required for operation\n");

    /**************************
     * Create Contraction Plan
     **************************/

    nvpltensorPlan_t plan;
    HANDLE_ERROR(nvpltensorCreatePlan(handle, &plan,
                 desc, planPref, workspaceSizeEstimate));

    printf("Initialize plan\n");

    /**************************
     * Optional: Query information about the created plan
     **************************/

    // query actually used workspace
    uint64_t actualWorkspaceSize = 0;
    HANDLE_ERROR(nvpltensorPlanGetAttribute(handle, plan,
                 NVPLTENSOR_PLAN_REQUIRED_WORKSPACE,
                 &actualWorkspaceSize, sizeof(actualWorkspaceSize)));

    // At this point the user knows exactly how much memory is needed by the
    // operation; only the (typically smaller) actual workspace must be allocated.
    assert(actualWorkspaceSize <= workspaceSizeEstimate);
    actualWorkspaceSize += 256; // small safety margin

    printf("Query information about the created plan\n");

    void* work = NULL;
    if (actualWorkspaceSize > 0)
    {
        // Round the size up: aligned_alloc requires size % alignment == 0.
        work = aligned_alloc(kAlignment,
                             round_up((int64_t) actualWorkspaceSize, kAlignment));
        if (work == NULL)
        {
            // Fall back to running without workspace instead of passing a
            // null pointer together with a nonzero workspace size.
            actualWorkspaceSize = 0;
        }
    }

    /**********************
     * Execute
     **********************/

    HANDLE_ERROR(nvpltensorContract(handle, plan,
                 (void*) &alpha, A, B, (void*) &beta, C, C,
                 work, actualWorkspaceSize));

    printf("Perform operation\n");

    /**********************
     * Free allocated data (reverse order of creation)
     **********************/

    HANDLE_ERROR(nvpltensorDestroyPlan(plan));
    HANDLE_ERROR(nvpltensorDestroyPlanPreference(planPref));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));
    HANDLE_ERROR(nvpltensorDestroy(handle));

    free(A);
    free(B);
    free(C);
    free(work); // free(NULL) is a no-op

    return 0;
}
就是这样。我们已经通过 nvplTENSOR 执行了我们的第一个缩并!您可以在 example 目录中找到此示例和其他示例。