快速入门¶

本节包含 cuFFT LTO EA 示例的简化和注释版本，该示例与 zip 文件中的二进制文件一起分发。

该示例在频域中执行多个信号的低通滤波。

/* 
 * Example showing the use of LTO callbacks with CUFFT to perform 
 * R2C -> callback -> C2R.
 * 
*/

#include <vector>
#include <random>
#include <cuda_runtime_api.h>
#include <cufftXt.h>

#define ERROR_VALUE -1
#define PASS_VALUE   0

// Check CUDA API error
inline int checkErrors(cudaError_t error, int line_number) {
	if(error != cudaSuccess) {
		printf("Example failed in CUDA API on line %d with error %d\n", line_number, error);
		return ERROR_VALUE;
	}
	return PASS_VALUE;
}

// Check cuFFT API error
inline int checkErrors(cufftResult error, int line_number) {
	if(error != CUFFT_SUCCESS) {
		printf("Example failed in cuFFT API on line %d with error %d\n", line_number, error);
		return ERROR_VALUE;
	}
	return PASS_VALUE;
}
#define CHECK_ERROR(error) checkErrors(error, __LINE__)

// NOTE: Header containing the compiled LTO callback device function in a C array, generated with bin2c
#include "callback_fatbin.h"

// Struct to pass data to callback
struct cb_params {
  unsigned window_N;
  unsigned signal_size;
};

// Problem input parameters
constexpr unsigned batches              = 830;
constexpr unsigned signal_size          = 328;
constexpr unsigned window_size          =  32;
constexpr unsigned complex_signal_size  = signal_size / 2 + 1;

// Initialize the input signal with random values
void init_input_signals(unsigned batches, unsigned signal_size, float* signals) {

	std::mt19937 e2(0);
	std::uniform_real_distribution<> dist(0., 1.);

	for(unsigned batch = 0; batch < batches; ++batch) {
		for(unsigned s = 0; s < signal_size; ++s) {
			unsigned idx = batch * signal_size + s;
			signals[idx] = dist(e2);
		}
	}
}

int main() {

	// Padded array for in-place transforms
	float  input_signals[batches][2 * complex_signal_size] = {};
	float output_signals[batches][2 * complex_signal_size];
	float      reference[batches][2 * complex_signal_size];

	init_input_signals(batches, 2 * complex_signal_size, &input_signals[0][0]);

	const size_t complex_size_bytes = batches * complex_signal_size * 2 * sizeof(float);

	// Allocate and copy input from host to GPU
	float *device_signals;
	CHECK_ERROR(cudaMalloc((void **)&device_signals, complex_size_bytes));
	CHECK_ERROR(cudaMemcpy(device_signals, input_signals, complex_size_bytes, cudaMemcpyHostToDevice));

	// Define a structure used to pass in the window size
	cb_params host_params;
	host_params.window_N    = window_size;
	host_params.signal_size = complex_signal_size;

	// Allocate and copy callback parameters from host to GPU
	cb_params *device_params;
	CHECK_ERROR(cudaMalloc((void **)&device_params, sizeof(cb_params)));
	CHECK_ERROR(cudaMemcpy(device_params, &host_params, sizeof(cb_params), cudaMemcpyHostToDevice));

	// Create a CUFFT plan for the forward transform, and a cuFFT plan for the inverse transform with load callback
	cufftHandle forward_plan, inverse_plan_cb;
	size_t work_size;

	CHECK_ERROR(cufftCreate(&forward_plan));
	CHECK_ERROR(cufftCreate(&inverse_plan_cb));

	// NOTE: LTO callbacks must be set before plan creation and cannot be unset (yet)
	size_t lto_callback_fatbin_size = sizeof(window_callback);
	CHECK_ERROR(cufftXtSetJITCallback(inverse_plan_cb, (void*)window_callback, lto_callback_fatbin_size,
				          CUFFT_CB_LD_COMPLEX, (void **)&device_params));

	CHECK_ERROR(cufftMakePlan1d(forward_plan, signal_size, CUFFT_R2C, batches, &work_size));
	CHECK_ERROR(cufftMakePlan1d(inverse_plan_cb, signal_size, CUFFT_C2R, batches, &work_size));

	// Transform signal forward
	printf("Transforming signal cufftExecR2C\n");
	CHECK_ERROR(cufftExecR2C(forward_plan,    (cufftReal *)device_signals, (cufftComplex *)device_signals));

	// Apply window via load callback and inverse-transform the signal
	printf("Transforming signal cufftExecC2R\n");
	CHECK_ERROR(cufftExecC2R(inverse_plan_cb, (cufftComplex *)device_signals, (cufftReal *)device_signals));

	// Copy device memory to host
	CHECK_ERROR(cudaMemcpy(output_signals, device_signals, complex_size_bytes, cudaMemcpyDeviceToHost));

	// Destroy CUFFT context
	CHECK_ERROR(cufftDestroy(forward_plan));
	CHECK_ERROR(cufftDestroy(inverse_plan_cb));

	// Cleanup memory
	CHECK_ERROR(cudaFree(device_signals));
	CHECK_ERROR(cudaFree(device_params));

	return PASS_VALUE;
}

具体来说，它执行以下操作

它分配并初始化输入：batches 个大小为 signal_size 的信号用随机值初始化。
它将输入数据复制到 GPU。
它创建一个正向（R2C，实数到复数）计划和一个反向（C2R，复数到实数）计划。
在计划创建后（使用 cufftCreate(…)），但在调用计划函数之前，它使用 cuFFT API 的扩展 cufftXtSetJITCallback(…) 将包含 fatbin 的数组与回调函数关联到计划。
它为两个计划调用计划函数 (cufftMakePlan1d(…))。
它执行正向计划。
它执行反向计划。此计划运行加载回调，该回调将低通滤波器实现为窗口函数。
它将结果复制回主机。
它使用 cufftDestroy(…) 销毁计划，并释放 GPU 资源。

回调设备函数位于一个单独的源文件中，如下所示

/* 
 * Example showing the use of LTO callbacks with CUFFT to perform 
 * truncation with zero padding.
 * 
*/

#include <cufftXt.h>

struct cb_params {
	unsigned window_N;
	unsigned signal_size;
};

// This is the load callback routine. It filters high frequencies
// based on a truncation window specified by the user
// NOTE: unlike the non-LTO version, the callback device function
// must have the name cufftJITCallbackLoadComplex, it cannot be aliased
__device__ cufftComplex cufftJITCallbackLoadComplex(void *input,
                                                    size_t index,
                                                    void *info,
                                                    void *sharedmem) {
  
	const cb_params* params = static_cast<const cb_params*>(info);
	cufftComplex* cb_output = static_cast<cufftComplex*>(input);
	const unsigned sample   = index % params->signal_size;

	return (sample < params->window_N) ? cb_output[index] : cufftComplex{0.f, 0.f};
}

当命名 cuFFT LTO EA 中的 LTO 回调函数时，有一些限制。有关更多详细信息，请参阅此处。

在编译示例之前，我们需要将 tar 包中包含的库文件和头文件复制到 CUDA Toolkit 文件夹中。

$ cp nvidia-cufft-11.1.0-Linux/opt/cufft/include/* /path/to/cuda/toolkit/include
$ cp nvidia-cufft-11.1.0-Linux/opt/cufft/lib/* /path/to/cuda/toolkit/lib64

然后可以像这样编译和运行示例

$ nvcc --std=c++11 --generate-code arch=compute_50,code=lto_50 -dc -fatbin callback.cu -o callback.fatbin
$ bin2c --name window_callback --type longlong callback.fatbin > callback_fatbin.h
$ g++ -I /path/to/cuda/toolkit/include -L /path/to/cuda/toolkit/lib64 lto_ea.cpp -o lto_ea -lcufft -lcudart
$ ./lto_ea
Transforming signal cufftExecR2C
Transforming signal cufftExecC2R