执行方法#

执行方法用于运行用户使用 cuBLASDx 运算符定义的 BLAS 函数。

注意

目前,cuBLASDx 仅支持在 CUDA 线程块级别(块执行)上执行。

块执行方法#

如果描述符是使用 块运算符 构建的,并且 is_complete_blas_execution 特性为 true,则可以使用块执行方法。

共享内存 API#

方法 execute(...) 运行由 BLAS 描述符定义的计算,接受三种类型的参数。

using BLAS = decltype(cublasdx::Size<M, N, K>() + ...);

// #1 - Tensor API

template<class Alpha,                         // Must be convertible to BLAS::c_value_type
         class AEngine, class ALayout,        // Types derived from pointer and layout used to create tensor_a
         class BEngine, class BLayout,        // Types derived from pointer and layout used to create tensor_b
         class Beta,                          // Must be convertible to BLAS::c_value_type
         class CEngine, class CLayout,        // Types derived from pointer and layout used to create tensor_c
         class ALoadOp = cublasdx::identity,  // Transform operation applied when data is loaded from matrix A
         class BLoadOp = cublasdx::identity,  // Transform operation applied when data is loaded from matrix B
         class CLoadOp = cublasdx::identity,  // Transform operation applied when data is loaded from matrix C
         class CStoreOp = cublasdx::identity> // Transform operation applied when data is stored to matrix C
inline __device__ void execute(const Alpha&                               alpha,
                               const cublasdx::tensor<AEngine, ALayout>&  tensor_a,
                               const cublasdx::tensor<BEngine, BLayout>&  tensor_b,
                               const Beta&                                beta,
                               cublasdx::tensor<CEngine, CLayout>&        tensor_c,
                               const ALoadOp&                             a_load_op  = {},
                               const BLoadOp&                             b_load_op  = {},
                               const CLoadOp&                             c_load_op  = {},
                               const CStoreOp&                            c_store_op = {})


// #2 - Pointer API
template<
  class Alpha, // Must be convertible to BLAS::c_value_type
  class TA,    // Value type of matrix A
  class TB,    // Value type of matrix B
  class Beta,  // Must be convertible to BLAS::c_value_type
  class TC,    // Value type of matrix C
  class ALoadTransformOp  = cublasdx::identity, // Transform operation applied when data is loaded from matrix A
  class BLoadTransformOp  = cublasdx::identity, // Transform operation applied when data is loaded from matrix B
  class CLoadTransformOp  = cublasdx::identity, // Transform operation applied when data is loaded from matrix C
  class CStoreTransformOp = cublasdx::identity> // Transform operation applied when data is stored to matrix C
inline __device__ void execute(const Alpha              alpha,
                               TA*                      matrix_a,
                               TB*                      matrix_b,
                               const Beta               beta,
                               TC*                      matrix_c,
                               const ALoadTransformOp&  a_load_op  = {},
                               const BLoadTransformOp&  b_load_op  = {},
                               const CLoadTransformOp&  c_load_op  = {},
                               const CStoreTransformOp& c_store_op = {})

// #3 - Pointer API, which allows providing runtime/dynamic leading dimensions for matrices A, B, and C
template<
  class Alpha,                                  // Must be convertible to BLAS::c_value_type
  class TA,                                     // Value type of matrix A
  class TB,                                     // Value type of matrix B
  class Beta,                                   // Must be convertible to BLAS::c_value_type
  class TC,                                     // Value type of matrix C
  class ALoadTransformOp  = cublasdx::identity, // Transform operation applied when data is loaded from matrix A
  class BLoadTransformOp  = cublasdx::identity, // Transform operation applied when data is loaded from matrix B
  class CLoadTransformOp  = cublasdx::identity, // Transform operation applied when data is loaded from matrix C
  class CStoreTransformOp = cublasdx::identity> // Transform operation applied when data is stored to matrix C
inline __device__ void BLAS::execute(const Alpha&             alpha,
                                     TA*                      matrix_a,
                                     const unsigned int       lda,     // runtime leading dimension of matrix A
                                     TB*                      matrix_b,
                                     const unsigned int       ldb,     // runtime leading dimension of matrix B
                                     const Beta&              beta,
                                     TC*                      matrix_c,
                                     const unsigned int       ldc,     // runtime leading dimension of matrix C
                                     const ALoadTransformOp&  a_load_op  = {},
                                     const BLoadTransformOp&  b_load_op  = {},
                                     const CLoadTransformOp&  c_load_op  = {},
                                     const CStoreTransformOp& c_store_op = {})

方法 #1 接受 cublasdx::tensor 作为矩阵 A、B 和 C 的共享内存存储表示。cublasdx::tensor 本质上是 CuTe 张量 (cute::Tensor),一种多维数组的表示,具有丰富的功能,抽象化了数组元素如何在内存中组织和存储的细节。

请参阅 cuBLASDx 张量、张量创建 和 获取内存布局,了解如何使用原始内存指针和 CuTe 布局创建张量,以及在需要时如何使用动态定义的引导维度。如果需要,用户可以传递具有自定义布局的张量。

在方法 #2 和 #3 中,指针 matrix_a、matrix_b 和 matrix_c 必须指向与 BLAS::<a/b/c>_alignment 对齐的共享内存区域。如果未使用 对齐 运算符,则 BLAS::<a/b/c>_alignment 等于 alignof(BLAS::<a/b/c>_value_type)。

方法 #2 和 #3 假定每个矩阵的布局都与其设置的排列方式相对应,即,如果在 BLAS 描述中使用了 Arrangement<col_major, row_major, col_major>,则 A 矩阵应为列优先,B 矩阵应为行优先,C 矩阵应为列优先。默认排列方式对应于使用 Arrangement<row_major, col_major, col_major>

方法 #3 允许用户通过 lda、ldb 和 ldc 参数提供自定义的动态引导维度。在这种情况下,通过 引导维度 运算符设置的引导维度值将被忽略。lda、ldb 和 ldc 的值必须遵循与 引导维度 运算符中相同的规则。

在执行函数之后,用户必须在访问 A、B 和 C 之前执行 CUDA 块同步。

下面的代码示例演示了如何使用三种 execute(...) 方法。

#include <cublasdx.hpp>

using GEMM = decltype(cublasdx::Size<32, 32, 32>()
              + cublasdx::Precision<cublasdx::tfloat32_t, cublasdx::tfloat32_t, float>()
              + cublasdx::Type<cublasdx::type::real>()
              + cublasdx::Arrangement<cublasdx::row_major, cublasdx::col_major>()
              + cublasdx::Function<cublasdx::function::MM>()
              + cublasdx::MaxAlignment() // max alignment (16, 16, 16) is the default
              + cublasdx::SM<800>()
              + cublasdx::Block());

using a_data_type = typename GEMM::a_value_type;
using b_data_type = typename GEMM::b_value_type;
using c_data_type = typename GEMM::c_value_type;

extern __shared__ __align__(16) char smem[];

// smem_<a/b/c> are aligned to cublasdx::alignment_of<GEMM>::<a/b/c>
auto [smem_a, smem_b, smem_c] = cublasdx::slice_shared_memory<GEMM>(smem);

//*********** Method #1, using cublasdx tensor APIs
{
    // Make global memory tensor
    auto a_global_tensor = cublasdx::make_tensor(a, GEMM::get_layout_gmem_a());
    auto b_global_tensor = cublasdx::make_tensor(b, GEMM::get_layout_gmem_b());
    auto c_global_tensor = cublasdx::make_tensor(c, GEMM::get_layout_gmem_c());

    // Make shared memory tensor
    auto a_shared_tensor = cublasdx::make_tensor(smem_a, GEMM::get_layout_smem_a());
    auto b_shared_tensor = cublasdx::make_tensor(smem_b, GEMM::get_layout_smem_b());
    auto c_shared_tensor = cublasdx::make_tensor(smem_c, GEMM::get_layout_smem_c());

    // Load data from global to shared memory using cublasdx::copy API
    using alignment = cublasdx::alignment_of<GEMM>;
    cublasdx::copy<GEMM, alignment::a>(a_global_tensor, a_shared_tensor);
    cublasdx::copy<GEMM, alignment::b>(b_global_tensor, b_shared_tensor);
    cublasdx::copy<GEMM, alignment::c>(c_global_tensor, c_shared_tensor);
    cublasdx::copy_wait();

    // Execute
    GEMM().execute(alpha, a_shared_tensor, b_shared_tensor, beta, c_shared_tensor);
    __syncthreads();

    // Store back to global memory using cublasdx::copy API
    cublasdx::copy<GEMM, alignment::c>(c_shared_tensor, c_global_tensor);
    cublasdx::copy_wait(); // Needed to ensure c_global_tensor has a defined state and data in it can be used for any following operations in the kernel. If there are no further instruction a kernel's finalization will be the final synchronization point.
}

//*********** Method #1, cublasdx tensor APIs, with dynamic leading dimensions
{
    // Make global memory tensor
    auto a_global_tensor = cublasdx::make_tensor(a, GEMM::get_layout_gmem_a(lda));
    auto b_global_tensor = cublasdx::make_tensor(b, GEMM::get_layout_gmem_b(ldb));
    auto c_global_tensor = cublasdx::make_tensor(c, GEMM::get_layout_gmem_c(ldc)); // C uses its own leading dimension (ldc, not ldb)

    // Make shared memory tensor
    auto a_shared_tensor = cublasdx::make_tensor(smem_a, GEMM::get_layout_smem_a(lda));
    auto b_shared_tensor = cublasdx::make_tensor(smem_b, GEMM::get_layout_smem_b(ldb));
    auto c_shared_tensor = cublasdx::make_tensor(smem_c, GEMM::get_layout_smem_c(ldc));

    // Load data from global to shared memory using cublasdx::copy API
    using alignment = cublasdx::alignment_of<GEMM>;
    cublasdx::copy<GEMM, alignment::a>(a_global_tensor, a_shared_tensor);
    cublasdx::copy<GEMM, alignment::b>(b_global_tensor, b_shared_tensor);
    cublasdx::copy<GEMM, alignment::c>(c_global_tensor, c_shared_tensor);
    cublasdx::copy_wait();

    // Execute
    GEMM().execute(alpha, a_shared_tensor, b_shared_tensor, beta, c_shared_tensor);
    __syncthreads();

    // Store back to global memory using cublasdx::copy API
    cublasdx::copy<GEMM, alignment::c>(c_shared_tensor, c_global_tensor);
    cublasdx::copy_wait(); // Only needed if more operations on shared memory used in c_shared_tensor happens in the kernel
}

//*********** Method #2, using raw share memory pointers
{
    // User code to load data from global to shared memory
    // smem_a <-- a, smem_b <-- b, smem_c <-- c

    // Execute
    GEMM().execute(alpha, smem_a, smem_b, beta, smem_c);
    __syncthreads();

    // User code to store back to global memory
    // smem_c --> c
}

//*********** Method #3, with dynamic leading dimensions
{
    // User code to load data from global to shared memory
    // smem_a <-- a, smem_b <-- b, smem_c <-- c

    // Execute
    GEMM().execute(alpha, smem_a, lda, smem_b, ldb, beta, smem_c, ldc);
    __syncthreads();

    // User code to store back to global memory
    // smem_c --> c
}

寄存器 API#

方法 execute(...) 运行由 BLAS 描述符定义的计算,接受两种类型的参数。

using BLAS = decltype(cublasdx::Size<M, N, K>() + ...);

// #1 - Registers with accumulator API

template<class AEngine, class ALayout,       // Types derived from pointer and layout used to create tensor_a
         class BEngine, class BLayout,       // Types derived from pointer and layout used to create tensor_b
         class CEngine, class CLayout,       // Types derived from pointer and layout used to create tensor_c
         class ALoadOp = cublasdx::identity, // Transform operation applied when data is loaded from matrix A
         class BLoadOp = cublasdx::identity> // Transform operation applied when data is loaded from matrix B
inline __device__ void execute(const cublasdx::tensor<AEngine, ALayout>&  tensor_a,
                               const cublasdx::tensor<BEngine, BLayout>&  tensor_b,
                               cublasdx::tensor<CEngine, CLayout>      &  tensor_c,
                               const ALoadOp&                             a_load_op  = {},
                               const BLoadOp&                             b_load_op  = {})

// #2 - Registers without accumulator API

template<class AEngine, class ALayout,
         class BEngine, class BLayout,
         class ALoadOp = cublasdx::identity,
         class BLoadOp = cublasdx::identity>
inline __device__ auto execute(const cublasdx::tensor<AEngine, ALayout>&  tensor_a,
                               const cublasdx::tensor<BEngine, BLayout>&  tensor_b,
                               const ALoadOp&                             a_load_op  = {},
                               const BLoadOp&                             b_load_op  = {})

方法 #1 接受用于 A 和 B 矩阵的共享内存张量,但接受用于 C 矩阵的寄存器片段。它不返回任何内容,因为它会将 A 和 B 相乘的结果添加到 C,从而得到

\(\mathbf{C}_{m\times n} = \mathbf{A}_{m\times k} \times \mathbf{B}_{k\times n} + \mathbf{C}_{m\times n}\)

寄存器片段必须预先存在,可以来自方法 #2 的先前执行,或者从分区器对象创建(请参阅 数据分区器)。它必须与它所用于的 GEMM 的精度和分区完全匹配。

方法 #2 仅接受用于 A 和 B 矩阵的共享内存张量。它返回一个不透明的元组,其中包含结果寄存器片段及其对应的分区器。结果对应于

\(\mathbf{C}_{m\times n} = \mathbf{A}_{m\times k} \times \mathbf{B}_{k\times n}\)

C++17 结构化绑定可用于清晰地检索这两个值

auto [c_register_fragment, partitioner] = BLAS().execute(a_shared_tensor, b_shared_tensor, a_load_op, b_load_op);

下面的代码示例演示了如何使用两种 execute(...) 方法。

#include <cublasdx.hpp>

using GEMM = decltype(cublasdx::Size<32, 32, 32>()
              + cublasdx::Precision<cublasdx::tfloat32_t, cublasdx::tfloat32_t, float>()
              + cublasdx::Type<cublasdx::type::real>()
              + cublasdx::Arrangement<cublasdx::row_major, cublasdx::col_major>()
              + cublasdx::Function<cublasdx::function::MM>()
              + cublasdx::MaxAlignment() // max alignment (16, 16, 16) is the default
              + cublasdx::SM<800>()
              + cublasdx::Block());

using a_data_type = typename GEMM::a_value_type;
using b_data_type = typename GEMM::b_value_type;
using c_data_type = typename GEMM::c_value_type;

extern __shared__ __align__(16) char smem[];

// smem_<a/b> are aligned to cublasdx::alignment_of<GEMM>::<a/b>
auto [smem_a, smem_b] = cublasdx::slice_shared_memory_ab<GEMM>(smem);

//*********** Method #1, register API with accumulator
{
    // Make global memory tensor
    auto a_global_tensor = cublasdx::make_tensor(a, GEMM::get_layout_gmem_a());
    auto b_global_tensor = cublasdx::make_tensor(b, GEMM::get_layout_gmem_b());
    auto c_global_tensor = cublasdx::make_tensor(c, GEMM::get_layout_gmem_c());

    // Make shared memory tensor
    auto a_shared_tensor = cublasdx::make_tensor(smem_a, GEMM::get_layout_smem_a());
    auto b_shared_tensor = cublasdx::make_tensor(smem_b, GEMM::get_layout_smem_b());

    // Load data from global to shared memory using cublasdx::copy API
    using alignment = cublasdx::alignment_of<GEMM>;
    cublasdx::copy<GEMM, alignment::a>(a_global_tensor, a_shared_tensor);
    cublasdx::copy<GEMM, alignment::b>(b_global_tensor, b_shared_tensor);
    cublasdx::copy_wait();

    // Execute
    auto partitioner = GEMM::get_partitioner();
    auto c_register_fragment = partitioner.make_accumulator_fragment();
    cublasdx::copy_fragment<alignment::c>(c_global_tensor, c_register_fragment, partitioner);

    GEMM().execute(a_shared_tensor, b_shared_tensor, c_register_fragment);

    // Store back to global memory using cublasdx::copy_fragment API
    cublasdx::copy_fragment<alignment::c>(c_register_fragment, c_global_tensor, partitioner);
}

//*********** Method #2, cublasdx tensor APIs, without accumulator
{
    // Make global memory tensor
    auto a_global_tensor = cublasdx::make_tensor(a, GEMM::get_layout_gmem_a());
    auto b_global_tensor = cublasdx::make_tensor(b, GEMM::get_layout_gmem_b());
    auto c_global_tensor = cublasdx::make_tensor(c, GEMM::get_layout_gmem_c());

    // Make shared memory tensor
    auto a_shared_tensor = cublasdx::make_tensor(smem_a, GEMM::get_layout_smem_a());
    auto b_shared_tensor = cublasdx::make_tensor(smem_b, GEMM::get_layout_smem_b());

    // Load data from global to shared memory using cublasdx::copy API
    using alignment = cublasdx::alignment_of<GEMM>;
    cublasdx::copy<GEMM, alignment::a>(a_global_tensor, a_shared_tensor);
    cublasdx::copy<GEMM, alignment::b>(b_global_tensor, b_shared_tensor);
    cublasdx::copy_wait();

    // Execute
    auto [c_register_fragment, partitioner] = GEMM().execute(a_shared_tensor, b_shared_tensor);

    // Store back to global memory using cublasdx::copy_fragment API
    cublasdx::copy_fragment<alignment::c>(c_register_fragment, c_global_tensor, partitioner);
}

输入数据属性#

注意

从 cuBLASDx 0.3.0 开始,计算精度已与数据精度解耦,即,每个矩阵的输入/输出数据可以是任意类型(即使是浮点 GEMM 的整数输入),前提是设置了 对齐运算符 并且满足以下至少一个条件

  1. 它可以隐式转换为使用 精度运算符类型运算符 选择的数据类型。

  2. 对于输入:提供了一个适当的转换加载操作作为参数之一。它接受输入类型值。其结果必须至少可以隐式转换为计算类型。

  3. 对于输出:提供了一个适当的转换存储操作作为参数之一。它接受结果计算类型(通常是由 精度运算符类型运算符 定义的 C 类型)。其结果必须至少可以隐式转换为输出类型。

警告

如果使用与输入类型解耦的计算精度,则必须显式设置 对齐运算符

标量(alpha 和 beta)的底层元素类型默认假定为 BLAS::c_value_type,但它们可以是任何类型,只要

  1. 它们的对齐方式和大小与 BLAS::c_value_type 相同,并且

  2. 它们可以转换为 BLAS::c_value_type

转换操作输入#

所有方法都接受转换仿函数。a_load_op、b_load_op 和 c_load_op 在从每个矩阵读取元素时应用,c_store_op 在矩阵乘法的结果存储在 C 矩阵之前应用。每个仿函数都必须表示一个元素级转换,它

  1. 对于加载转换:接受 execute(...) 方法的相应输入类型,并返回可隐式转换为 BLAS::<a/b/c>_value_type 类型的值。

  2. 对于存储转换:接受 BLAS::<a/b/c>_value_type 并返回 execute(...) 方法的相应输入类型。

示例

using GEMM = decltype(Size<128, 128, 128>() + Type<type::real>() + Precision<float, float, double>() + Block() + ...);

// Example element-wise load transform functor: doubles each value as it is
// loaded from a matrix (passed as a_load_op in the execute(...) call below).
struct multiple_by_2 {
  template<class T>
  __device__ constexpr T operator()(const T arg) const {
    return arg * static_cast<T>(2.0f);
  }
};

// Example element-wise store transform functor: negates each value before it
// is stored to matrix C (passed as c_store_op in the execute(...) call below).
struct negate {
  template <class T>
  __device__ constexpr T operator()(const T arg) const {
    return -arg;
  }
};

GEMM().execute(..., multiple_by_2{}, cublasdx::conjugate{}, cublasdx::identity{}, negate{});

警告

不保证使用完全相同的输入但使用不同的 BLAS 描述(例如不同的块维度)执行完全相同的 BLAS 函数将产生位相同的结果。

警告

不保证在不同 CUDA 架构的 GPU 上使用完全相同的输入执行完全相同的 BLAS 函数将产生位相同的结果。

值格式#

BLAS::a_value_type
BLAS::b_value_type
BLAS::c_value_type

对于每种精度的复数,复数中的第一个值是实部,第二个值是虚部。对于实数,BLAS::<a/b/c>_value_type 与用于描述 BLAS 的 Precision<PA, PB, PC> 中的 P 相同(或默认精度)。

输入/输出数据格式#

本节介绍正确计算所需的输入和输出数据格式(布局)。

GEMM (function::MM)#

用于通用矩阵乘法的张量 API(execute() 方法,它期望使用 cublasdx::tensor 表示的矩阵)接受由具有任意布局的张量表示的矩阵。由于张量对象携带有关矩阵的维度、内存位置和布局的所有信息,因此不需要其他隐式假设。矩阵的维度必须与 大小 运算符定义的维度匹配。另请参阅 获取内存布局建议的共享内存布局 部分。

用于通用矩阵乘法的指针 API(execute() 的 #2 和 #3 重载)假定输入矩阵 matrix_amatrix_bmatrix_c 中的值按照添加到描述中的 排列 运算符定义的方式存储(默认情况下,matrix_a 为行优先格式,matrix_b 为列优先格式,matrix_c 为列优先格式)。

共享内存使用#

重要的是要注意,大型 BLAS 操作(由 大小 运算符定义)可能需要每个 CUDA 块超过 48 KB 的共享内存用于矩阵。因此,如 CUDA 编程指南(#1、#2、#3)中所述,具有此类 BLAS 操作的内核必须使用动态共享内存,而不是静态大小的共享内存数组。此外,这些内核需要使用 cudaFuncSetAttribute() 显式选择加入,以设置 cudaFuncAttributeMaxDynamicSharedMemorySize。请参阅下面的示例代码。

#include <cublasdx.hpp>
using namespace cublasdx;

using GEMM = decltype(cublasdx::Size<128, 128, 64>()
              + cublasdx::Precision<__nv_fp8_e4m3, __nv_fp8_e5m2, float>()
              + cublasdx::Type<cublasdx::type::real>()
              + cublasdx::Arrangement<cublasdx::row_major, cublasdx::col_major>()
              + cublasdx::Function<cublasdx::function::MM>()
              + cublasdx::SM<900>()
              + cublasdx::Block());

// Host-side example: queries the shared memory requirement for GEMM and
// opts the kernel into >48 KB dynamic shared memory before launching.
void example() {
  (...)

  // Get required shared memory sizes, options:

  // Shared Memory API
  // 1 - Shared memory size required for matrices based on GEMM definition
  auto shared_memory_size = cublasdx::get_shared_storage_size<GEMM>();
  // 2 - Shared memory size when dynamic leading dimensions are used
  auto shared_memory_size = cublasdx::get_shared_storage_size<GEMM>(lda, ldb, ldc);
  // 3 - Shared memory size calculated based on custom matrix layouts for A, B, C matrices
  auto shared_memory_size = cublasdx::get_shared_storage_size<GEMM>(matrix_a_layout, matrix_b_layout, matrix_c_layout);

  // Register API
  // 1 - Shared memory size required for matrices based on GEMM definition
  auto shared_memory_size = cublasdx::get_shared_storage_size_ab<GEMM>();
  // 2 - Shared memory size when dynamic leading dimensions are used
  auto shared_memory_size = cublasdx::get_shared_storage_size_ab<GEMM>(lda, ldb);
  // 3 - Shared memory size calculated based on custom matrix layouts for A, B matrices
  auto shared_memory_size = cublasdx::get_shared_storage_size_ab<GEMM>(matrix_a_layout, matrix_b_layout);

  // Increases the max dynamic shared memory size to match GEMM requirements.
  // Note: the attribute is set on the same kernel instantiation that is launched below.
  cudaFuncSetAttribute(gemm_kernel<GEMM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size);
  // Invokes kernel with GEMM::block_dim threads in CUDA block
  gemm_kernel<GEMM><<<1, GEMM::block_dim, shared_memory_size>>>(alpha, a, b, beta, c);

  (...)
}