Getting Started#

In this section, we show how to implement a first tensor contraction using nvplTENSOR. Our code will compute the following operation using single-precision arithmetic.

\[C_{0,2,1,3} = \alpha A_{0,4,5,1} B_{2,5,3,4} + \beta C_{0,2,1,3}\]
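
Written out as an explicit sum, modes 4 and 5 appear in \(A\) and \(B\) but not in \(C\), so they are the contracted (summed) modes:

\[C_{i_0,i_2,i_1,i_3} = \alpha \sum_{i_4, i_5} A_{i_0,i_4,i_5,i_1} B_{i_2,i_5,i_3,i_4} + \beta C_{i_0,i_2,i_1,i_3}\]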

We build up the code step by step, where each step adds code at the end. The individual steps are separated by comments consisting of multiple asterisks.

Headers and Data Types#

First, we start off with a simple main() function, include some headers, and define some data types.

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <nvpl_tensor.h>

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    return 0;
}

Define Tensor Sizes#

Next, we define the modes and extents of our tensors. For the sake of this example, let us assume that modes 0, 1, and 2 have an extent of 6, while modes 3, 4, and 5 have an extent of 4. Note how the modes are labeled with 32-bit integers; since character literals are just integers, modes could equally well be labeled with characters (see the short sketch after the listing below). Please refer to Nomenclature for an explanation of the terms mode and extent.

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <nvpl_tensor.h>

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    int32_t modeC[] = {0,2,1,3};
    int32_t modeA[] = {0,4,5,1};
    int32_t modeB[] = {2,5,3,4};
    const int nmodeA = 4;
    const int nmodeB = 4;
    const int nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    return 0;
}
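
As noted above, mode labels are plain int32_t values, so character literals would work just as well. A hypothetical alternative labeling of the same contraction (not used in the remainder of this guide) is sketched below; note that the convenient extent[mode[i]] lookup above would then require a small label-to-extent mapping instead of direct array indexing.

// Hypothetical alternative: label the same modes with characters instead of
// small integers (character literals are plain int values in C).
int32_t modeC[] = {'a', 'c', 'b', 'd'};
int32_t modeA[] = {'a', 'e', 'f', 'b'};
int32_t modeB[] = {'c', 'f', 'd', 'e'};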

Initialize Tensor Data#

Next, we need to allocate and initialize the host memory for our tensors:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        return -1;
    }

    // Initialize data
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize tensors\n");

    /**********************
     * Free allocated data
     **********************/

    if (A) free(A);
    if (B) free(B);
    if (C) free(C);

    return 0;
}
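
A small portability note on the allocation above: C11's aligned_alloc expects the requested size to be an integral multiple of the alignment. The tensor sizes in this example happen to satisfy that (and glibc is lenient), but rounding the size up is the portable pattern. A minimal helper sketch, not part of the sample:

#include <stdlib.h>

// Round the requested size up to the next multiple of the alignment
// before forwarding the call to aligned_alloc.
static void* aligned_alloc_padded(size_t alignment, size_t size)
{
    size_t padded = (size + alignment - 1) / alignment * alignment;
    return aligned_alloc(alignment, padded);
}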

Create Tensor Descriptors#

We are now ready to use the nvplTENSOR library and to initialize its library handle. Afterwards, we create a descriptor for each tensor by providing its order (number of modes), extents, data type, and the alignment (in bytes) of the data pointer for which the tensor descriptor is created.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

// Handle nvplTENSOR errors
#define HANDLE_ERROR(x)                                           \
    {                                                             \
        const nvpltensorStatus_t err = x;                         \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(-1);                                             \
        }                                                         \
    };

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        return -1;
    }

    // Initialize data
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set the number of threads that nvplTENSOR can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /**********************
     * Free allocated data
     **********************/

    HANDLE_ERROR(nvpltensorDestroy(handle));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));

    if (A) free(A);
    if (B) free(B);
    if (C) free(C);

    return 0;
}
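
Passing NULL for the stride argument lets nvplTENSOR assume a dense, packed layout for each tensor. If explicit strides are ever needed, they can be built from the extents and passed instead of NULL. The fragment below, which reuses nmodeA and extentA from the listing, sketches the strides of a packed layout in which the leftmost mode varies fastest; treat that ordering as an assumption and check the nvpltensorCreateTensorDescriptor documentation before relying on it.

// Sketch: explicit strides (in elements) for a packed layout whose
// leftmost mode varies fastest. Passing NULL above already selects the
// library's default dense layout.
int64_t strideA[nmodeA];
int64_t s = 1;
for (int i = 0; i < nmodeA; ++i)
{
    strideA[i] = s;
    s *= extentA[i];
}
// The strides could then replace the NULL argument:
// HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
//              nmodeA, extentA, strideA, typeA, kAlignment));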

Create Contraction Descriptor#

In this step, we create the operation descriptor that encodes the contraction and make sure that the scalar type is correct:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

// Handle nvplTENSOR errors
#define HANDLE_ERROR(x)                                           \
    {                                                             \
        const nvpltensorStatus_t err = x;                         \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(-1);                                             \
        }                                                         \
    };

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        return -1;
    }

    // Initialize data
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set the number of threads that nvplTENSOR can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));

    assert(scalarType == NVPLTENSOR_R_32F);
    typedef float floatTypeCompute;
    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**********************
     * Free allocated data
     **********************/

    HANDLE_ERROR(nvpltensorDestroy(handle));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));

    if (A) free(A);
    if (B) free(B);
    if (C) free(C);

    return 0;
}

Determine Algorithm and Workspace#

Now that we have defined both the tensors and the contraction that we want to perform, we must select an algorithm to carry out the contraction. That algorithm is specified by nvpltensorAlgo_t. Specifying NVPLTENSOR_ALGO_DEFAULT lets nvplTENSOR's internal heuristics choose the best approach. All the information needed to find a good algorithm is stored in the nvpltensorPlanPreference_t data structure. We can also query the library to estimate the amount of workspace required for the provided operation descriptor; users can choose between different nvpltensorWorksizePreference_t values. For this example, we use NVPLTENSOR_WORKSPACE_DEFAULT, which is a good default choice that aims for high performance while reducing the workspace requirement. While workspace memory is not strictly required, it is strongly recommended to attain high performance.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

// Handle nvplTENSOR errors
#define HANDLE_ERROR(x)                                           \
    {                                                             \
        const nvpltensorStatus_t err = x;                         \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(-1);                                             \
        }                                                         \
    };

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        return -1;
    }

    // Initialize data
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set the number of threads that nvplTENSOR can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));

    assert(scalarType == NVPLTENSOR_R_32F);
    typedef float floatTypeCompute;
    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**************************
     * Set the algorithm to use
     ***************************/

    nvpltensorAlgo_t const algo = NVPLTENSOR_ALGO_DEFAULT;

    nvpltensorPlanPreference_t planPref;
    HANDLE_ERROR(nvpltensorCreatePlanPreference(handle, &planPref,
                 algo, NVPLTENSOR_JIT_MODE_NONE));

    printf("Initialize plan preference\n");

    /**********************
     * Query workspace estimate
     **********************/

    uint64_t workspaceSizeEstimate = 0;
    nvpltensorWorksizePreference_t const workspacePref = NVPLTENSOR_WORKSPACE_DEFAULT;
    HANDLE_ERROR(nvpltensorEstimateWorkspaceSize(handle, desc,
                 planPref, workspacePref, &workspaceSizeEstimate));

    printf("Estimate workspace required for operation\n");

    /**********************
     * Free allocated data
     **********************/

    HANDLE_ERROR(nvpltensorDestroy(handle));
    HANDLE_ERROR(nvpltensorDestroyPlanPreference(planPref));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));

    if (A) free(A);
    if (B) free(B);
    if (C) free(C);

    return 0;
}

Note that we are not using just-in-time compilation in this example (NVPLTENSOR_JIT_MODE_NONE). If you would like to learn more about nvplTENSOR's JIT capabilities, please refer to the JIT Compilation section.

Plan and Reduce Workspace Size#

With an estimate of the workspace size at hand, we can now go ahead and create the actual plan; this step relies on nvplTENSOR's heuristics to select the most suitable algorithm and kernel:

nvpltensorPlan_t plan;
HANDLE_ERROR(nvpltensorCreatePlan(handle,
             &plan,
             desc,
             planPref,
             workspaceSizeEstimate));

Once the plan has been created and the kernel has been selected, we can (optionally) query the amount of workspace that the plan actually requires:

uint64_t actualWorkspaceSize = 0;
HANDLE_ERROR(nvpltensorPlanGetAttribute(handle,
    plan,
    NVPLTENSOR_PLAN_REQUIRED_WORKSPACE,
    &actualWorkspaceSize,
    sizeof(actualWorkspaceSize)));

With this, our code becomes:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

// Handle nvplTENSOR errors
#define HANDLE_ERROR(x)                                           \
    {                                                             \
        const nvpltensorStatus_t err = x;                         \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(-1);                                             \
        }                                                         \
    };

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        return -1;
    }

    // Initialize data
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set the number of threads that nvplTENSOR can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));

    assert(scalarType == NVPLTENSOR_R_32F);
    typedef float floatTypeCompute;
    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**************************
     * Set the algorithm to use
     ***************************/

    nvpltensorAlgo_t const algo = NVPLTENSOR_ALGO_DEFAULT;

    nvpltensorPlanPreference_t planPref;
    HANDLE_ERROR(nvpltensorCreatePlanPreference(handle, &planPref,
                 algo, NVPLTENSOR_JIT_MODE_NONE));

    printf("Initialize plan preference\n");

    /**********************
     * Query workspace estimate
     **********************/

    uint64_t workspaceSizeEstimate = 0;
    nvpltensorWorksizePreference_t const workspacePref = NVPLTENSOR_WORKSPACE_DEFAULT;
    HANDLE_ERROR(nvpltensorEstimateWorkspaceSize(handle, desc,
                 planPref, workspacePref, &workspaceSizeEstimate));

    printf("Estimate workspace required for operation\n");

    /**************************
     * Create Contraction Plan
     **************************/

    nvpltensorPlan_t plan;
    HANDLE_ERROR(nvpltensorCreatePlan(handle, &plan,
                 desc, planPref, workspaceSizeEstimate));

    printf("Initialize plan\n");

    /**************************
     * Optional: Query information about the created plan
     **************************/

    // query actually used workspace
    uint64_t actualWorkspaceSize = 0;
    HANDLE_ERROR(nvpltensorPlanGetAttribute(handle, plan,
                 NVPLTENSOR_PLAN_REQUIRED_WORKSPACE,
                 &actualWorkspaceSize, sizeof(actualWorkspaceSize)));

    // At this point the user knows exactly how much memory is need by the operation and
    // only the smaller actual workspace needs to be allocated
    assert(actualWorkspaceSize <= workspaceSizeEstimate);
    actualWorkspaceSize += 256;

    printf("Query information about the created plan\n");

    void* work = NULL;
    if (actualWorkspaceSize > 0)
    {
        work = aligned_alloc(kAlignment, actualWorkspaceSize);
    }

    /**********************
     * Free allocated data
     **********************/

    HANDLE_ERROR(nvpltensorDestroy(handle));
    HANDLE_ERROR(nvpltensorDestroyPlan(plan));
    HANDLE_ERROR(nvpltensorDestroyPlanPreference(planPref));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));

    if (A) free(A);
    if (B) free(B);
    if (C) free(C);
    if (work) free(work);

    return 0;
}

Execute#

Finally, we are ready to execute the tensor contraction and to destroy (free) all the allocated resources:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

// Handle nvplTENSOR errors
#define HANDLE_ERROR(x)                                           \
    {                                                             \
        const nvpltensorStatus_t err = x;                         \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(-1);                                             \
        }                                                         \
    };

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        return -1;
    }

    // Initialize data
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set the number of threads that nvplTENSOR can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));

    assert(scalarType == NVPLTENSOR_R_32F);
    typedef float floatTypeCompute;
    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**************************
     * Set the algorithm to use
     ***************************/

    nvpltensorAlgo_t const algo = NVPLTENSOR_ALGO_DEFAULT;

    nvpltensorPlanPreference_t planPref;
    HANDLE_ERROR(nvpltensorCreatePlanPreference(handle, &planPref,
                 algo, NVPLTENSOR_JIT_MODE_NONE));

    printf("Initialize plan preference\n");

    /**********************
     * Query workspace estimate
     **********************/

    uint64_t workspaceSizeEstimate = 0;
    nvpltensorWorksizePreference_t const workspacePref = NVPLTENSOR_WORKSPACE_DEFAULT;
    HANDLE_ERROR(nvpltensorEstimateWorkspaceSize(handle, desc,
                 planPref, workspacePref, &workspaceSizeEstimate));

    printf("Estimate workspace required for operation\n");

    /**************************
     * Create Contraction Plan
     **************************/

    nvpltensorPlan_t plan;
    HANDLE_ERROR(nvpltensorCreatePlan(handle, &plan,
                 desc, planPref, workspaceSizeEstimate));

    printf("Initialize plan\n");

    /**************************
     * Optional: Query information about the created plan
     **************************/

    // query actually used workspace
    uint64_t actualWorkspaceSize = 0;
    HANDLE_ERROR(nvpltensorPlanGetAttribute(handle, plan,
                 NVPLTENSOR_PLAN_REQUIRED_WORKSPACE,
                 &actualWorkspaceSize, sizeof(actualWorkspaceSize)));

    // At this point the user knows exactly how much memory is need by the operation and
    // only the smaller actual workspace needs to be allocated
    assert(actualWorkspaceSize <= workspaceSizeEstimate);
    actualWorkspaceSize += 256;

    printf("Query information about the created plan\n");

    void* work = NULL;
    if (actualWorkspaceSize > 0)
    {
        work = aligned_alloc(kAlignment, actualWorkspaceSize);
    }

    /**********************
     * Execute
     **********************/

    HANDLE_ERROR(nvpltensorContract(handle, plan,
                 (void*) &alpha, A, B, (void*) &beta, C, C,
                 work, actualWorkspaceSize));

    printf("Perform operation\n");

    /**********************
     * Free allocated data
     **********************/

    HANDLE_ERROR(nvpltensorDestroy(handle));
    HANDLE_ERROR(nvpltensorDestroyPlan(plan));
    HANDLE_ERROR(nvpltensorDestroyPlanPreference(planPref));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));

    if (A) free(A);
    if (B) free(B);
    if (C) free(C);
    if (work) free(work);

    return 0;
}

That is it! We have executed our first contraction via nvplTENSOR. You can find this and other examples in the example directory.
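
If you would like to sanity-check the result, a naive reference loop can be compared against C (up to floating-point rounding). The sketch below reuses A, B, C, alpha, and beta from the final listing and assumes that the dense layout selected by the NULL strides is packed with the leftmost mode varying fastest; verify that assumption against the tensor descriptor documentation before relying on it.

// Naive reference for C[0,2,1,3] = alpha * A[0,4,5,1] * B[2,5,3,4] + beta * C[0,2,1,3].
// Modes 0, 1, and 2 have extent 6; modes 3, 4, and 5 have extent 4 (see above).
// Since beta == 0 in this example, the original values of C do not contribute;
// otherwise a copy of C taken before nvpltensorContract() would be needed.
for (int64_t i3 = 0; i3 < 4; ++i3)
for (int64_t i1 = 0; i1 < 6; ++i1)
for (int64_t i2 = 0; i2 < 6; ++i2)
for (int64_t i0 = 0; i0 < 6; ++i0)
{
    float acc = 0.f;
    for (int64_t i5 = 0; i5 < 4; ++i5)   // contracted mode 5
    for (int64_t i4 = 0; i4 < 4; ++i4)   // contracted mode 4
    {
        // A is stored with mode order (0,4,5,1), B with (2,5,3,4)
        acc += A[i0 + 6 * (i4 + 4 * (i5 + 4 * i1))]
             * B[i2 + 6 * (i5 + 4 * (i3 + 4 * i4))];
    }
    float ref = alpha * acc;  // beta == 0, so no contribution from the old C
    // C is stored with mode order (0,2,1,3)
    float out = C[i0 + 6 * (i2 + 6 * (i1 + 6 * i3))];
    // 'ref' and 'out' should agree up to rounding, e.g.
    // assert(fabsf(ref - out) <= 1e-3f * fabsf(ref) + 1e-3f);
}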