Just-In-Time (JIT) Compilation
This section describes the just-in-time (JIT) compilation feature. It allows users to compile dedicated kernels that maximize the performance of a specific operation.
The complexity of a given contraction (e.g., the number and order of its modes, the number of contracted modes) determines the size of its kernel search space (i.e., the set of candidate kernels that could be used to perform the contraction). As complexity increases, this search space can become very large. Our pre-compiled kernels are carefully selected to perform well across a wide range of contractions; however, as the complexity of a contraction grows, a just-in-time-compiled kernel tailored to that contraction can outperform any kernel drawn from a fixed-size set of pre-compiled kernels.
JIT compilation overcomes this limitation by creating a kernel that can better exploit the optimization opportunities available for the given contraction.
Compiling a kernel typically costs 1-8 seconds (depending on the kernel and the host CPU); this cost is incurred only once per kernel, during the planning phase, and the kernel can be reused by subsequent contractions (i.e., kernels are cached automatically once compiled).
All JIT-compiled kernels are added to a kernel cache that is accessible library-wide (i.e., shared across library handles). We provide functions to write this kernel cache to disk and read it back, to avoid paying the cost of JIT-compiling the same kernels again when a program is rerun.
The remainder of this section assumes familiarity with the Getting Started guide.
Note
The JIT compilation feature is only available for GPUs of compute capability greater than or equal to 8.0 (Ampere or newer). Furthermore, this feature is currently limited to tensor contractions.
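As a minimal sketch (not part of the original examples, and assuming cuda_runtime.h is included as in the listings below), the compute capability can be queried at run time to decide whether JIT compilation should be requested:
int device = 0;
cudaDeviceProp prop;
if (cudaGetDeviceProperties(&prop, device) == cudaSuccess && prop.major >= 8)
{
    // Ampere (compute capability 8.x) or newer: JIT compilation is available,
    // so CUTENSOR_JIT_MODE_DEFAULT may be passed to cutensorCreatePlanPreference().
}
else
{
    // Older GPU: fall back to CUTENSOR_JIT_MODE_NONE (pre-compiled kernels).
}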
Introductory Example
This subsection provides a basic overview of the API calls and features related to JIT compilation.
We start by computing a contraction using the same steps described in the Getting Started guide, but with a different contraction whose larger number of modes highlights the advantage of JIT compilation. We then describe the modifications required to enable JIT compilation and compare the performance of the pre-compiled and the JIT-compiled kernels.
Our code computes the contraction
C[modeC] = alpha * A[modeA] * B[modeB] + beta * C[modeC]
with modeA = {0, 1, 2, …, 24}, modeB = {25, 26, 27, 28, 29, 30, 5, 7, 11, 13, 16, 18, 22}, and modeC = {0, 1, 2, 3, 4, 6, 8, 9, 25, 26, 10, 12, 14, 27, 15, 28, 17, 19, 29, 20, 21, 30, 23, 24} (note that we now name each mode with a number rather than a letter, since the number of modes exceeds the number of letters in the Latin alphabet).
All operands hold single-precision complex values, and the computation is performed using emulated single-precision arithmetic (3XTF32). All modes have an extent of 2.
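Since every mode has an extent of 2, the memory footprint is easy to work out: A has 25 modes and therefore 2^25 = 33,554,432 complex values (256 MiB at 8 bytes per value), C has 24 modes (128 MiB), and B has 13 modes (64 KiB), for a total of roughly 0.38 GiB; this matches the "Total memory" value printed by the example below.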
The steps for computing the above contraction are identical to those of the Getting Started guide and are listed below:
#include <chrono>
#include <complex>
#include <stdlib.h>
#include <stdio.h>
#include <unordered_map>
#include <vector>
#include <cuda_runtime.h>
#include <cutensor.h>
// Handle cuTENSOR errors
#define HANDLE_ERROR(x) { \
const auto err = x; \
if( err != CUTENSOR_STATUS_SUCCESS ) \
{ printf("Error: %s in line %d\n", cutensorGetErrorString(err), __LINE__); exit(-1); } \
};
// Handle CUDA errors
#define HANDLE_CUDA_ERROR(x) { \
const auto err = x; \
if( err != cudaSuccess ) \
{ printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); exit(-1); } \
};
class CPUTimer
{
public:
void start()
{
start_ = std::chrono::steady_clock::now();
}
double seconds()
{
end_ = std::chrono::steady_clock::now();
elapsed_ = end_ - start_;
//return in ms
return elapsed_.count() * 1000;
}
private:
typedef std::chrono::steady_clock::time_point tp;
tp start_;
tp end_;
std::chrono::duration<double> elapsed_;
};
struct GPUTimer
{
GPUTimer()
{
cudaEventCreate(&start_);
cudaEventCreate(&stop_);
cudaEventRecord(start_, 0);
}
~GPUTimer()
{
cudaEventDestroy(start_);
cudaEventDestroy(stop_);
}
void start()
{
cudaEventRecord(start_, 0);
}
float seconds()
{
cudaEventRecord(stop_, 0);
cudaEventSynchronize(stop_);
float time;
cudaEventElapsedTime(&time, start_, stop_);
return time * 1e-3;
}
private:
cudaEvent_t start_, stop_;
};
int main()
{
typedef std::complex<float> TypeA;
typedef std::complex<float> TypeB;
typedef std::complex<float> TypeC;
typedef std::complex<float> TypeScalar;
auto alpha = TypeScalar(1.1, 0.0);
auto beta = TypeScalar(0.0, 0.0);
// CUDA types
cutensorDataType_t typeA = CUTENSOR_C_32F;
cutensorDataType_t typeB = CUTENSOR_C_32F;
cutensorDataType_t typeC = CUTENSOR_C_32F;
cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_3XTF32;
/* ***************************** */
// Create vector of modes
std::vector<int> modeC{0,1,2,3,4,6,8,9,25,26,10,12,14,27,15,28,17,19,29,20,21,30,23,24};
std::vector<int> modeA{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24};
std::vector<int> modeB{25,26,27,28,29,30,5,7,11,13,16,18,22};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();
// Extents
std::unordered_map<int, int64_t> extent;
for (auto i = 0; i <= 30; i++)
extent[i] = 2;
// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for (auto mode : modeC)
extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for (auto mode : modeA)
extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for (auto mode : modeB)
extentB.push_back(extent[mode]);
/**********************
* Allocating data
**********************/
// Number of elements of each tensor
size_t elementsA = 1;
for (auto mode : modeA)
elementsA *= extent[mode];
size_t elementsB = 1;
for (auto mode : modeB)
elementsB *= extent[mode];
size_t elementsC = 1;
for (auto mode : modeC)
elementsC *= extent[mode];
// Size in bytes
size_t sizeA = sizeof(TypeA) * elementsA;
size_t sizeB = sizeof(TypeB) * elementsB;
size_t sizeC = sizeof(TypeC) * elementsC;
printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC)/1024./1024./1024);
// Allocate on device
void *A_d, *B_d, *C_d;
HANDLE_CUDA_ERROR(cudaMalloc((void**) &A_d, sizeA));
HANDLE_CUDA_ERROR(cudaMalloc((void**) &B_d, sizeB));
HANDLE_CUDA_ERROR(cudaMalloc((void**) &C_d, sizeC));
// Allocate on host
TypeA *A = (TypeA*) malloc(sizeof(TypeA) * elementsA);
TypeB *B = (TypeB*) malloc(sizeof(TypeB) * elementsB);
TypeC *C = (TypeC*) malloc(sizeof(TypeC) * elementsC);
if (A == nullptr || B == nullptr || C == nullptr)
{
printf("Error: Host allocation of A, B, or C.\n");
exit(-1);
}
/*******************
* Initialize data
*******************/
for (int64_t i = 0; i < elementsA; i++)
A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for (int64_t i = 0; i < elementsB; i++)
B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for (int64_t i = 0; i < elementsC; i++)
C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
// Copy to device
HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
/*************************
* cuTENSOR
*************************/
// Initialize cuTENSOR library
cutensorHandle_t handle;
HANDLE_ERROR(cutensorCreate(&handle));
/**********************
* Create Tensor Descriptors
**********************/
cutensorTensorDescriptor_t descA;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descA,
nmodeA,
extentA.data(),
NULL,/*stride*/
typeA, kAlignment));
cutensorTensorDescriptor_t descB;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descB,
nmodeB,
extentB.data(),
NULL,/*stride*/
typeB, kAlignment));
cutensorTensorDescriptor_t descC;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descC,
nmodeC,
extentC.data(),
NULL,/*stride*/
typeC, kAlignment));
/*******************************
* Create Contraction Descriptor
*******************************/
cutensorOperationDescriptor_t desc;
HANDLE_ERROR(cutensorCreateContraction(handle,
&desc,
descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(),
descCompute));
/**************************
* Set the algorithm to use -- without just-in-time compilation
***************************/
const cutensorAlgo_t algo = CUTENSOR_ALGO_GETT;
cutensorPlanPreference_t planPref;
HANDLE_ERROR(cutensorCreatePlanPreference(handle,
&planPref,
algo,
CUTENSOR_JIT_MODE_NONE));
/**********************
* Query workspace estimate
**********************/
uint64_t workspaceSizeEstimate = 0;
const cutensorWorksizePreference_t workspacePref = CUTENSOR_WORKSPACE_DEFAULT;
HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
desc,
planPref,
workspacePref,
&workspaceSizeEstimate));
// Allocate workspace
void *work = nullptr;
if (workspaceSizeEstimate > 0)
{
HANDLE_CUDA_ERROR(cudaMalloc(&work, workspaceSizeEstimate));
}
/**************************
* Create Contraction Plan -- without just-in-time compilation
**************************/
cutensorPlan_t plan;
HANDLE_ERROR(cutensorCreatePlan(handle,
&plan,
desc,
planPref,
workspaceSizeEstimate));
/**********************
* Execute the tensor contraction
**********************/
cudaStream_t stream;
HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));
double minTimeCUTENSOR = 1e100;
for (int i=0; i < 3; ++i)
{
cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
// Set up timing
GPUTimer timer;
timer.start();
HANDLE_ERROR(cutensorContract(handle,
plan,
(void*) &alpha, A_d, B_d,
(void*) &beta, C_d, C_d,
work, workspaceSizeEstimate, stream))
// Synchronize and measure timing
auto time = timer.seconds();
minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
}
/*************************/
float flops;
HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
desc,
CUTENSOR_OPERATION_DESCRIPTOR_FLOPS,
(void*)&flops,
sizeof(flops)));
auto gflops = flops / 1e9;
auto gflopsPerSec = gflops / minTimeCUTENSOR;
printf("cuTENSOR : %6.0f GFLOPs/s\n", gflopsPerSec);
HANDLE_ERROR(cutensorDestroy(handle));
HANDLE_ERROR(cutensorDestroyOperationDescriptor(desc));
HANDLE_ERROR(cutensorDestroyTensorDescriptor(descA));
HANDLE_ERROR(cutensorDestroyTensorDescriptor(descB));
HANDLE_ERROR(cutensorDestroyTensorDescriptor(descC));
HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));
HANDLE_ERROR(cutensorDestroyPlanPreference(planPref));
HANDLE_ERROR(cutensorDestroyPlan(plan));
if (A) free(A);
if (B) free(B);
if (C) free(C);
if (A_d) cudaFree(A_d);
if (B_d) cudaFree(B_d);
if (C_d) cudaFree(C_d);
if (work) cudaFree(work);
printf("Successful completion\n");
return 0;
}
All that is required to enable JIT compilation is to change the last argument of cutensorCreatePlanPreference() from CUTENSOR_JIT_MODE_NONE to CUTENSOR_JIT_MODE_DEFAULT; no further changes are needed:
cutensorPlanPreference_t planPrefJit;
cutensorCreatePlanPreference(handle,
&planPrefJit,
algo,
CUTENSOR_JIT_MODE_DEFAULT);
The kernel is compiled during the call to cutensorCreatePlan(). This call is blocking, and the compilation may take a few seconds. Once the plan has been created, the compiled kernel is stored in the kernel cache (see Reading and Writing the Kernel Cache from/to Disk) and is used by subsequent calls to cutensorContract().
Note
To reuse a JIT-compiled kernel in subsequent contractions (with a different plan), the user must again set CUTENSOR_JIT_MODE_DEFAULT when calling cutensorCreatePlanPreference(); otherwise, pre-compiled kernels are used.
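To make this concrete, the following sketch shows the JIT path in isolation; it assumes that handle, desc, algo, and the HANDLE_ERROR macro from the example above are already in scope:
cutensorPlanPreference_t planPrefJit;
HANDLE_ERROR(cutensorCreatePlanPreference(handle, &planPrefJit, algo,
                                          CUTENSOR_JIT_MODE_DEFAULT));

uint64_t workspaceSizeJit = 0;
HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle, desc, planPrefJit,
                                           CUTENSOR_WORKSPACE_DEFAULT, &workspaceSizeJit));

cutensorPlan_t planJit;
// Blocking call: the specialized kernel is JIT-compiled here and added to the kernel cache.
HANDLE_ERROR(cutensorCreatePlan(handle, &planJit, desc, planPrefJit, workspaceSizeJit));

// Any later plan that should pick up the cached kernel must again be created from a
// plan preference that requests CUTENSOR_JIT_MODE_DEFAULT.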
The complete, runnable example follows:
#include <chrono>
#include <complex>
#include <stdlib.h>
#include <stdio.h>
#include <unordered_map>
#include <vector>
#include <cuda_runtime.h>
#include <cutensor.h>
// Handle cuTENSOR errors
#define HANDLE_ERROR(x) { \
const auto err = x; \
if( err != CUTENSOR_STATUS_SUCCESS ) \
{ printf("Error: %s in line %d\n", cutensorGetErrorString(err), __LINE__); exit(-1); } \
};
// Handle CUDA errors
#define HANDLE_CUDA_ERROR(x) { \
const auto err = x; \
if( err != cudaSuccess ) \
{ printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); exit(-1); } \
};
class CPUTimer
{
public:
void start()
{
start_ = std::chrono::steady_clock::now();
}
double seconds()
{
end_ = std::chrono::steady_clock::now();
elapsed_ = end_ - start_;
//return in ms
return elapsed_.count() * 1000;
}
private:
typedef std::chrono::steady_clock::time_point tp;
tp start_;
tp end_;
std::chrono::duration<double> elapsed_;
};
struct GPUTimer
{
GPUTimer()
{
cudaEventCreate(&start_);
cudaEventCreate(&stop_);
cudaEventRecord(start_, 0);
}
~GPUTimer()
{
cudaEventDestroy(start_);
cudaEventDestroy(stop_);
}
void start()
{
cudaEventRecord(start_, 0);
}
float seconds()
{
cudaEventRecord(stop_, 0);
cudaEventSynchronize(stop_);
float time;
cudaEventElapsedTime(&time, start_, stop_);
return time * 1e-3;
}
private:
cudaEvent_t start_, stop_;
};
int main()
{
typedef std::complex<float> TypeA;
typedef std::complex<float> TypeB;
typedef std::complex<float> TypeC;
typedef std::complex<float> TypeScalar;
auto alpha = TypeScalar(1.1, 0.0);
auto beta = TypeScalar(0.0, 0.0);
// CUDA types
cutensorDataType_t typeA = CUTENSOR_C_32F;
cutensorDataType_t typeB = CUTENSOR_C_32F;
cutensorDataType_t typeC = CUTENSOR_C_32F;
cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_3XTF32;
/* ***************************** */
// Create vector of modes
std::vector<int> modeC{0,1,2,3,4,6,8,9,25,26,10,12,14,27,15,28,17,19,29,20,21,30,23,24};
std::vector<int> modeA{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24};
std::vector<int> modeB{25,26,27,28,29,30,5,7,11,13,16,18,22};
int nmodeA = modeA.size();
int nmodeB = modeB.size();
int nmodeC = modeC.size();
// Extents
std::unordered_map<int, int64_t> extent;
for (auto i = 0; i <= 30; i++)
extent[i] = 2;
// Create a vector of extents for each tensor
std::vector<int64_t> extentC;
for (auto mode : modeC)
extentC.push_back(extent[mode]);
std::vector<int64_t> extentA;
for (auto mode : modeA)
extentA.push_back(extent[mode]);
std::vector<int64_t> extentB;
for (auto mode : modeB)
extentB.push_back(extent[mode]);
/**********************
* Allocating data
**********************/
// Number of elements of each tensor
size_t elementsA = 1;
for (auto mode : modeA)
elementsA *= extent[mode];
size_t elementsB = 1;
for (auto mode : modeB)
elementsB *= extent[mode];
size_t elementsC = 1;
for (auto mode : modeC)
elementsC *= extent[mode];
// Size in bytes
size_t sizeA = sizeof(TypeA) * elementsA;
size_t sizeB = sizeof(TypeB) * elementsB;
size_t sizeC = sizeof(TypeC) * elementsC;
printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC)/1024./1024./1024);
// Allocate on device
void *A_d, *B_d, *C_d;
HANDLE_CUDA_ERROR(cudaMalloc((void**) &A_d, sizeA));
HANDLE_CUDA_ERROR(cudaMalloc((void**) &B_d, sizeB));
HANDLE_CUDA_ERROR(cudaMalloc((void**) &C_d, sizeC));
// Allocate on host
TypeA *A = (TypeA*) malloc(sizeof(TypeA) * elementsA);
TypeB *B = (TypeB*) malloc(sizeof(TypeB) * elementsB);
TypeC *C = (TypeC*) malloc(sizeof(TypeC) * elementsC);
if (A == nullptr || B == nullptr || C == nullptr)
{
printf("Error: Host allocation of A, B, or C.\n");
exit(-1);
}
/*******************
* Initialize data
*******************/
for (int64_t i = 0; i < elementsA; i++)
A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for (int64_t i = 0; i < elementsB; i++)
B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
for (int64_t i = 0; i < elementsC; i++)
C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
// Copy to device
HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
/*************************
* cuTENSOR
*************************/
// Initialize cuTENSOR library
cutensorHandle_t handle;
HANDLE_ERROR(cutensorCreate(&handle));
/**********************
* Create Tensor Descriptors
**********************/
cutensorTensorDescriptor_t descA;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descA,
nmodeA,
extentA.data(),
NULL,/*stride*/
typeA, kAlignment));
cutensorTensorDescriptor_t descB;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descB,
nmodeB,
extentB.data(),
NULL,/*stride*/
typeB, kAlignment));
cutensorTensorDescriptor_t descC;
HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
&descC,
nmodeC,
extentC.data(),
NULL,/*stride*/
typeC, kAlignment));
/*******************************
* Create Contraction Descriptor
*******************************/
cutensorOperationDescriptor_t desc;
HANDLE_ERROR(cutensorCreateContraction(handle,
&desc,
descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
descC, modeC.data(),
descCompute));
/**************************
* Set the algorithm to use -- without just-in-time compilation
***************************/
const cutensorAlgo_t algo = CUTENSOR_ALGO_GETT;
cutensorPlanPreference_t planPref;
HANDLE_ERROR(cutensorCreatePlanPreference(handle,
&planPref,
algo,
CUTENSOR_JIT_MODE_NONE));
/**********************
* Query workspace estimate
**********************/
uint64_t workspaceSizeEstimate = 0;
const cutensorWorksizePreference_t workspacePref = CUTENSOR_WORKSPACE_DEFAULT;
HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
desc,
planPref,
workspacePref,
&workspaceSizeEstimate));
// Allocate workspace
void *work = nullptr;
if (workspaceSizeEstimate > 0)
{
HANDLE_CUDA_ERROR(cudaMalloc(&work, workspaceSizeEstimate));
}
/**************************
* Create Contraction Plan -- without just-in-time compilation
**************************/
cutensorPlan_t plan;
HANDLE_ERROR(cutensorCreatePlan(handle,
&plan,
desc,
planPref,
workspaceSizeEstimate));
/**********************
* Execute the tensor contraction
**********************/
cudaStream_t stream;
HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));
double minTimeCUTENSOR = 1e100;
for (int i=0; i < 3; ++i)
{
cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
// Set up timing
GPUTimer timer;
timer.start();
HANDLE_ERROR(cutensorContract(handle,
plan,
(void*) &alpha, A_d, B_d,
(void*) &beta, C_d, C_d,
work, workspaceSizeEstimate, stream))
// Synchronize and measure timing
auto time = timer.seconds();
minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
}
/*************************/
/**************************
* Set the algorithm to use -- with just-in-time compilation
***************************/
cutensorPlanPreference_t planPrefJit;
HANDLE_ERROR(cutensorCreatePlanPreference(handle,
&planPrefJit,
algo,
CUTENSOR_JIT_MODE_DEFAULT));
/**********************
* Query workspace estimate
**********************/
uint64_t workspaceSizeEstimateJit = 0;
const cutensorWorksizePreference_t workspacePrefJit = CUTENSOR_WORKSPACE_DEFAULT;
HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
desc,
planPrefJit,
workspacePrefJit,
&workspaceSizeEstimateJit));
// Allocate workspace
void *workJit = nullptr;
if (workspaceSizeEstimateJit > 0)
{
HANDLE_CUDA_ERROR(cudaMalloc(&workJit, workspaceSizeEstimateJit));
}
/**************************
* Create Contraction Plan -- with just-in-time compilation
**************************/
cutensorPlan_t planJit;
CPUTimer jitPlanTimer;
jitPlanTimer.start();
// This is where the kernel is actually compiled
HANDLE_ERROR(cutensorCreatePlan(handle,
&planJit,
desc,
planPrefJit,
workspaceSizeEstimateJit));
auto jitPlanTime = jitPlanTimer.seconds();
/**********************
* Execute the tensor contraction using the JIT compiled kernel
**********************/
double minTimeCUTENSORJit = 1e100;
for (int i=0; i < 3; ++i)
{
cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
// Set up timing
GPUTimer timer;
timer.start();
HANDLE_ERROR(cutensorContract(handle,
planJit,
(void*) &alpha, A_d, B_d,
(void*) &beta, C_d, C_d,
workJit, workspaceSizeEstimateJit, stream))
// Synchronize and measure timing
auto time = timer.seconds();
minTimeCUTENSORJit = (minTimeCUTENSORJit < time) ? minTimeCUTENSORJit : time;
}
/*************************/
float flops;
HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
desc,
CUTENSOR_OPERATION_DESCRIPTOR_FLOPS,
(void*)&flops,
sizeof(flops)));
auto gflops = flops / 1e9;
auto gflopsPerSec = gflops / minTimeCUTENSOR;
auto gflopsPerSecJit = gflops / minTimeCUTENSORJit;
printf("cuTENSOR : %6.0f GFLOPs/s\n", gflopsPerSec);
printf("cuTENSOR JIT: %6.0f GFLOPs/s\n", gflopsPerSecJit);
printf("Speedup: %.1fx\n", gflopsPerSecJit / gflopsPerSec);
printf("JIT Compilation time: %.1f seconds\n", jitPlanTime / 1e3);
HANDLE_ERROR(cutensorDestroy(handle));
HANDLE_ERROR(cutensorDestroyOperationDescriptor(desc));
HANDLE_ERROR(cutensorDestroyTensorDescriptor(descA));
HANDLE_ERROR(cutensorDestroyTensorDescriptor(descB));
HANDLE_ERROR(cutensorDestroyTensorDescriptor(descC));
HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));
HANDLE_ERROR(cutensorDestroyPlanPreference(planPref));
HANDLE_ERROR(cutensorDestroyPlan(plan));
HANDLE_ERROR(cutensorDestroyPlanPreference(planPrefJit));
HANDLE_ERROR(cutensorDestroyPlan(planJit));
if (A) free(A);
if (B) free(B);
if (C) free(C);
if (A_d) cudaFree(A_d);
if (B_d) cudaFree(B_d);
if (C_d) cudaFree(C_d);
if (work) cudaFree(work);
if (workJit) cudaFree(workJit);
printf("Successful completion\n");
return 0;
}
The output of the above program on an NVIDIA H100 PCIe GPU is:
cuTENSOR : 774 GFLOPs/s
cuTENSOR JIT: 5374 GFLOPs/s
Speedup: 6.9x
JIT Compilation time: 8.3 seconds
This concludes the introductory example.
Reading and Writing the Kernel Cache from/to Disk
JIT compilation can take a significant amount of time, in particular when it is applied to tens or hundreds of distinct contractions. To amortize this overhead, we provide the ability to write the kernel cache to disk once all plans have been created. That way, the compilation cost of each plan is incurred only once across repeated executions of a program.
Note
The kernel cache file stores information about the cuTENSOR version (CUTENSOR_VERSION), the CUDA version on the system (CUDA_VERSION), and the GPU model (GPU_MODEL) that were used. For a kernel cache file to be read successfully, all three values must match exactly on the target system.
To write the kernel cache to a file once all plans that use JIT compilation have been created, use the cutensorWriteKernelCacheToFile() function:
cutensorWriteKernelCacheToFile(handle, "kernelCache.bin");
To read the file and load the kernels into a running cuTENSOR instance, simply use the cutensorReadKernelCacheFromFile() function:
cutensorReadKernelCacheFromFile(handle, "kernelCache.bin");
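On the first run the cache file will typically not exist yet, so it is good practice to inspect the returned status; a minimal sketch, mirroring the check used in the full example below (and reusing its HANDLE_ERROR macro):
const auto readStatus = cutensorReadKernelCacheFromFile(handle, "kernelCache.bin");
if (readStatus == CUTENSOR_STATUS_IO_ERROR)
    printf("No kernel cache found; kernels will be JIT-compiled during this run.\n");
else if (readStatus != CUTENSOR_STATUS_SUCCESS)
    HANDLE_ERROR(readStatus);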
Note
After reading the kernel cache from a file, the user must still enable JIT compilation during cutensorCreatePlanPreference() (by providing the CUTENSOR_JIT_MODE_DEFAULT argument) for the contractions that should use the previously JIT-compiled kernels.
Below we repeat the Introductory Example, except that at line 188 we check whether a kernel cache file can be read, and at line 399 we write the kernel cache to a file. On the second execution of this example the kernel cache is read and the compilation is avoided.
1#include <chrono>
2#include <complex>
3#include <stdlib.h>
4#include <stdio.h>
5#include <unordered_map>
6#include <vector>
7
8#include <cuda_runtime.h>
9#include <cutensor.h>
10
11// Handle cuTENSOR errors
12#define HANDLE_ERROR(x) { \
13 const auto err = x; \
14 if( err != CUTENSOR_STATUS_SUCCESS ) \
15 { printf("Error: %s in line %d\n", cutensorGetErrorString(err), __LINE__); exit(-1); } \
16};
17
18// Handle CUDA errors
19#define HANDLE_CUDA_ERROR(x) { \
20 const auto err = x; \
21 if( err != cudaSuccess ) \
22 { printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); exit(-1); } \
23};
24
25class CPUTimer
26{
27public:
28 void start()
29 {
30 start_ = std::chrono::steady_clock::now();
31 }
32
33 double seconds()
34 {
35 end_ = std::chrono::steady_clock::now();
36 elapsed_ = end_ - start_;
37 //return in ms
38 return elapsed_.count() * 1000;
39 }
40
41private:
42 typedef std::chrono::steady_clock::time_point tp;
43 tp start_;
44 tp end_;
45 std::chrono::duration<double> elapsed_;
46};
47
48struct GPUTimer
49{
50 GPUTimer()
51 {
52 cudaEventCreate(&start_);
53 cudaEventCreate(&stop_);
54 cudaEventRecord(start_, 0);
55 }
56
57 ~GPUTimer()
58 {
59 cudaEventDestroy(start_);
60 cudaEventDestroy(stop_);
61 }
62
63 void start()
64 {
65 cudaEventRecord(start_, 0);
66 }
67
68 float seconds()
69 {
70 cudaEventRecord(stop_, 0);
71 cudaEventSynchronize(stop_);
72 float time;
73 cudaEventElapsedTime(&time, start_, stop_);
74 return time * 1e-3;
75 }
76 private:
77 cudaEvent_t start_, stop_;
78};
79
80int main()
81{
82 typedef std::complex<float> TypeA;
83 typedef std::complex<float> TypeB;
84 typedef std::complex<float> TypeC;
85 typedef std::complex<float> TypeScalar;
86
87 auto alpha = TypeScalar(1.1, 0.0);
88 auto beta = TypeScalar(0.0, 0.0);
89
90 // CUDA types
91 cutensorDataType_t typeA = CUTENSOR_C_32F;
92 cutensorDataType_t typeB = CUTENSOR_C_32F;
93 cutensorDataType_t typeC = CUTENSOR_C_32F;
94 cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_3XTF32;
95
96
97 /* ***************************** */
98
99 // Create vector of modes
100 std::vector<int> modeC{0,1,2,3,4,6,8,9,25,26,10,12,14,27,15,28,17,19,29,20,21,30,23,24};
101 std::vector<int> modeA{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24};
102 std::vector<int> modeB{25,26,27,28,29,30,5,7,11,13,16,18,22};
103 int nmodeA = modeA.size();
104 int nmodeB = modeB.size();
105 int nmodeC = modeC.size();
106
107 // Extents
108 std::unordered_map<int, int64_t> extent;
109 for (auto i = 0; i <= 30; i++)
110 extent[i] = 2;
111
112 // Create a vector of extents for each tensor
113 std::vector<int64_t> extentC;
114 for (auto mode : modeC)
115 extentC.push_back(extent[mode]);
116 std::vector<int64_t> extentA;
117 for (auto mode : modeA)
118 extentA.push_back(extent[mode]);
119 std::vector<int64_t> extentB;
120 for (auto mode : modeB)
121 extentB.push_back(extent[mode]);
122
123 /**********************
124 * Allocating data
125 **********************/
126
127 // Number of elements of each tensor
128 size_t elementsA = 1;
129 for (auto mode : modeA)
130 elementsA *= extent[mode];
131 size_t elementsB = 1;
132 for (auto mode : modeB)
133 elementsB *= extent[mode];
134 size_t elementsC = 1;
135 for (auto mode : modeC)
136 elementsC *= extent[mode];
137
138 // Size in bytes
139 size_t sizeA = sizeof(TypeA) * elementsA;
140 size_t sizeB = sizeof(TypeB) * elementsB;
141 size_t sizeC = sizeof(TypeC) * elementsC;
142 printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC)/1024./1024./1024);
143
144 // Allocate on device
145 void *A_d, *B_d, *C_d;
146 HANDLE_CUDA_ERROR(cudaMalloc((void**) &A_d, sizeA));
147 HANDLE_CUDA_ERROR(cudaMalloc((void**) &B_d, sizeB));
148 HANDLE_CUDA_ERROR(cudaMalloc((void**) &C_d, sizeC));
149
150 // Allocate on host
151 TypeA *A = (TypeA*) malloc(sizeof(TypeA) * elementsA);
152 TypeB *B = (TypeB*) malloc(sizeof(TypeB) * elementsB);
153 TypeC *C = (TypeC*) malloc(sizeof(TypeC) * elementsC);
154
155 if (A == nullptr || B == nullptr || C == nullptr)
156 {
157 printf("Error: Host allocation of A, B, or C.\n");
158 exit(-1);
159 }
160
161 /*******************
162 * Initialize data
163 *******************/
164
165 for (int64_t i = 0; i < elementsA; i++)
166 A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
167 for (int64_t i = 0; i < elementsB; i++)
168 B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
169 for (int64_t i = 0; i < elementsC; i++)
170 C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
171
172 // Copy to device
173 HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
174 HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
175 HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
176
177 const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
178
179 /*************************
180 * cuTENSOR
181 *************************/
182
183 // Initialize cuTENSOR library
184 cutensorHandle_t handle;
185 HANDLE_ERROR(cutensorCreate(&handle));
186
187 // Read kernel cache from file (if the file was generated by a prior execution)
188 auto readKernelCacheStatus = cutensorReadKernelCacheFromFile(handle, "kernelCache.bin");
189
190 if (readKernelCacheStatus == CUTENSOR_STATUS_IO_ERROR)
191 printf("No kernel cache found. It will be generated before the end of this execution.\n");
192 else if (readKernelCacheStatus == CUTENSOR_STATUS_SUCCESS)
193 printf("Kernel cache found and read successfully.\n");
194 else
195 HANDLE_ERROR(readKernelCacheStatus);
196
197 /**********************
198 * Create Tensor Descriptors
199 **********************/
200
201 cutensorTensorDescriptor_t descA;
202 HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
203 &descA,
204 nmodeA,
205 extentA.data(),
206 NULL,/*stride*/
207 typeA, kAlignment));
208
209 cutensorTensorDescriptor_t descB;
210 HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
211 &descB,
212 nmodeB,
213 extentB.data(),
214 NULL,/*stride*/
215 typeB, kAlignment));
216
217 cutensorTensorDescriptor_t descC;
218 HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
219 &descC,
220 nmodeC,
221 extentC.data(),
222 NULL,/*stride*/
223 typeC, kAlignment));
224
225 /*******************************
226 * Create Contraction Descriptor
227 *******************************/
228
229 cutensorOperationDescriptor_t desc;
230 HANDLE_ERROR(cutensorCreateContraction(handle,
231 &desc,
232 descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
233 descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
234 descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
235 descC, modeC.data(),
236 descCompute));
237
238 /**************************
239 * Set the algorithm to use -- without just-in-time compilation
240 ***************************/
241
242 const cutensorAlgo_t algo = CUTENSOR_ALGO_GETT;
243
244 cutensorPlanPreference_t planPref;
245 HANDLE_ERROR(cutensorCreatePlanPreference(handle,
246 &planPref,
247 algo,
248 CUTENSOR_JIT_MODE_NONE));
249
250 /**********************
251 * Query workspace estimate
252 **********************/
253
254 uint64_t workspaceSizeEstimate = 0;
255 const cutensorWorksizePreference_t workspacePref = CUTENSOR_WORKSPACE_DEFAULT;
256 HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
257 desc,
258 planPref,
259 workspacePref,
260 &workspaceSizeEstimate));
261 // Allocate workspace
262 void *work = nullptr;
263 if (workspaceSizeEstimate > 0)
264 {
265 HANDLE_CUDA_ERROR(cudaMalloc(&work, workspaceSizeEstimate));
266 }
267
268 /**************************
269 * Create Contraction Plan -- without just-in-time compilation
270 **************************/
271
272 cutensorPlan_t plan;
273 HANDLE_ERROR(cutensorCreatePlan(handle,
274 &plan,
275 desc,
276 planPref,
277 workspaceSizeEstimate));
278
279 /**********************
280 * Execute the tensor contraction
281 **********************/
282
283 cudaStream_t stream;
284 HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));
285
286 double minTimeCUTENSOR = 1e100;
287 for (int i=0; i < 3; ++i)
288 {
289 cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
290
291 // Set up timing
292 GPUTimer timer;
293 timer.start();
294
295 HANDLE_ERROR(cutensorContract(handle,
296 plan,
297 (void*) &alpha, A_d, B_d,
298 (void*) &beta, C_d, C_d,
299 work, workspaceSizeEstimate, stream))
300
301 // Synchronize and measure timing
302 auto time = timer.seconds();
303
304 minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
305 }
306
307 /*************************/
308
309 /**************************
310 * Set the algorithm to use -- with just-in-time compilation
311 ***************************/
312
313 cutensorPlanPreference_t planPrefJit;
314 HANDLE_ERROR(cutensorCreatePlanPreference(handle,
315 &planPrefJit,
316 algo,
317 CUTENSOR_JIT_MODE_DEFAULT));
318
319 /**********************
320 * Query workspace estimate
321 **********************/
322
323 uint64_t workspaceSizeEstimateJit = 0;
324 const cutensorWorksizePreference_t workspacePrefJit = CUTENSOR_WORKSPACE_DEFAULT;
325 HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
326 desc,
327 planPrefJit,
328 workspacePrefJit,
329 &workspaceSizeEstimateJit));
330 // Allocate workspace
331 void *workJit = nullptr;
332 if (workspaceSizeEstimateJit > 0)
333 {
334 HANDLE_CUDA_ERROR(cudaMalloc(&workJit, workspaceSizeEstimateJit));
335 }
336
337 /**************************
338 * Create Contraction Plan -- with just-in-time compilation
339 **************************/
340
341 cutensorPlan_t planJit;
342 CPUTimer jitPlanTimer;
343 jitPlanTimer.start();
344 // This is where the kernel is actually compiled
345 HANDLE_ERROR(cutensorCreatePlan(handle,
346 &planJit,
347 desc,
348 planPrefJit,
349 workspaceSizeEstimateJit));
350 auto jitPlanTime = jitPlanTimer.seconds();
351
352 /**********************
353 * Execute the tensor contraction using the JIT compiled kernel
354 **********************/
355
356 double minTimeCUTENSORJit = 1e100;
357 for (int i=0; i < 3; ++i)
358 {
359 cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
360
361 // Set up timing
362 GPUTimer timer;
363 timer.start();
364
365 HANDLE_ERROR(cutensorContract(handle,
366 planJit,
367 (void*) &alpha, A_d, B_d,
368 (void*) &beta, C_d, C_d,
369 workJit, workspaceSizeEstimateJit, stream))
370
371 // Synchronize and measure timing
372 auto time = timer.seconds();
373
374 minTimeCUTENSORJit = (minTimeCUTENSORJit < time) ? minTimeCUTENSORJit : time;
375 }
376
377 /*************************/
378
379 float flops;
380 HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
381 desc,
382 CUTENSOR_OPERATION_DESCRIPTOR_FLOPS,
383 (void*)&flops,
384 sizeof(flops)));
385 auto gflops = flops / 1e9;
386 auto gflopsPerSec = gflops / minTimeCUTENSOR;
387 auto gflopsPerSecJit = gflops / minTimeCUTENSORJit;
388
389 printf("cuTENSOR : %6.0f GFLOPs/s\n", gflopsPerSec);
390 printf("cuTENSOR JIT: %6.0f GFLOPs/s\n", gflopsPerSecJit);
391 printf("Speedup: %.1fx\n", gflopsPerSecJit / gflopsPerSec);
392 printf("JIT Compilation time: %.1f seconds ", jitPlanTime / 1e3);
393 if (readKernelCacheStatus == CUTENSOR_STATUS_SUCCESS)
394 printf("(Kernel cache file was read successfully; Compilation was not required)\n");
395 else
396 printf("\n");
397
398 // Write kernel cache to file
399 HANDLE_ERROR(cutensorWriteKernelCacheToFile(handle, "kernelCache.bin"))
400 printf("Kernel cache written to file. Will be read in next execution.\n");
401
402 HANDLE_ERROR(cutensorDestroy(handle));
403 HANDLE_ERROR(cutensorDestroyOperationDescriptor(desc));
404 HANDLE_ERROR(cutensorDestroyTensorDescriptor(descA));
405 HANDLE_ERROR(cutensorDestroyTensorDescriptor(descB));
406 HANDLE_ERROR(cutensorDestroyTensorDescriptor(descC));
407 HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));
408 HANDLE_ERROR(cutensorDestroyPlanPreference(planPref));
409 HANDLE_ERROR(cutensorDestroyPlan(plan));
410 HANDLE_ERROR(cutensorDestroyPlanPreference(planPrefJit));
411 HANDLE_ERROR(cutensorDestroyPlan(planJit));
412
413 if (A) free(A);
414 if (B) free(B);
415 if (C) free(C);
416 if (A_d) cudaFree(A_d);
417 if (B_d) cudaFree(B_d);
418 if (C_d) cudaFree(C_d);
419 if (work) cudaFree(work);
420 if (workJit) cudaFree(workJit);
421
422 printf("Successful completion\n");
423 return 0;
424}