documentation/reference/cuda__gemm_8h_source.html

//   OpenNN: Open Neural Networks Library

//   www.opennn.net

//

//   C U D A   G E M M   M O D U L E

//

//   Artificial Intelligence Techniques SL

//   artelnics@artelnics.com


#pragma once


#include "tensor_utilities.h"


namespace opennn

{


#ifdef OPENNN_HAS_CUDA


// Move-only owner of the cuBLASLt objects describing one matmul: the operation
// descriptor, the four matrix layouts (A, B, C, D), and the selected algorithm
// together with the workspace size recorded for it.
//
// Every handle starts out as nullptr. The destructor releases whichever
// handles are set at that point.
struct LtMatmulPlan
{
    cublasLtMatmulDesc_t   op_desc = nullptr;
    cublasLtMatrixLayout_t a_desc  = nullptr;
    cublasLtMatrixLayout_t b_desc  = nullptr;
    cublasLtMatrixLayout_t c_desc  = nullptr;
    cublasLtMatrixLayout_t d_desc  = nullptr;
    cublasLtMatmulAlgo_t   algo{};
    bool                   algo_valid = false;
    size_t                 workspace_size = 0;  // bytes the chosen algo actually needs

    LtMatmulPlan() = default;

    // Copying is disabled: two copies would both destroy the same handles.
    LtMatmulPlan(const LtMatmulPlan&) = delete;
    LtMatmulPlan& operator=(const LtMatmulPlan&) = delete;

    // Move construction: the members are first set by their default
    // initializers, then exchanged with `other`, which is therefore left in the
    // default (empty) state.
    LtMatmulPlan(LtMatmulPlan&& other) noexcept { *this = move(other); }

    // Move assignment by member-wise swap: `other` ends up holding whatever this
    // object held before, and releases it when `other` itself is destroyed.
    LtMatmulPlan& operator=(LtMatmulPlan&& other) noexcept
    {
        swap(op_desc, other.op_desc);
        swap(a_desc,  other.a_desc);
        swap(b_desc,  other.b_desc);
        swap(c_desc,  other.c_desc);
        swap(d_desc,  other.d_desc);
        swap(algo,    other.algo);
        swap(algo_valid, other.algo_valid);
        swap(workspace_size, other.workspace_size);
        return *this;
    }

    // Releases the handles in reverse declaration order. Handles that are still
    // nullptr (default-constructed or moved-from plans) are skipped so that no
    // null handle is ever handed to cuBLASLt. Return statuses are not checked.
    ~LtMatmulPlan()
    {
        if (d_desc  != nullptr) cublasLtMatrixLayoutDestroy(d_desc);
        if (c_desc  != nullptr) cublasLtMatrixLayoutDestroy(c_desc);
        if (b_desc  != nullptr) cublasLtMatrixLayoutDestroy(b_desc);
        if (a_desc  != nullptr) cublasLtMatrixLayoutDestroy(a_desc);
        if (op_desc != nullptr) cublasLtMatmulDescDestroy(op_desc);
    }
};


// Lookup key for a matmul plan: problem shape, transpose flags, epilogue and
// data types. Two keys are equal only when every field matches.
struct LtMatmulPlanKey
{
    // Problem dimensions.
    int m;
    int n;
    int k;

    // Transpose settings for A and B, stored as plain ints.
    int transA;
    int transB;

    int epilogue;   // cublasLtEpilogue_t cast to int (e.g. BIAS, RELU_BIAS, BGRADA)
    int io_dtype;   // cudaDataType_t for A and B (inputs)
    int out_dtype;  // cudaDataType_t for C and D (outputs)

    // Field-by-field equality.
    bool operator==(const LtMatmulPlanKey& other) const noexcept
    {
        const bool same_shape = (m == other.m) && (n == other.n) && (k == other.k);

        const bool same_ops = (transA == other.transA)
                           && (transB == other.transB)
                           && (epilogue == other.epilogue);

        const bool same_types = (io_dtype == other.io_dtype)
                             && (out_dtype == other.out_dtype);

        return same_shape && same_ops && same_types;
    }
};


// Hash functor for LtMatmulPlanKey: feeds every key field, in declaration
// order, to hash_combine and returns the result.
struct LtMatmulPlanKeyHash
{
    size_t operator()(const LtMatmulPlanKey& key) const noexcept
    {
        return hash_combine(
            key.m, key.n, key.k,            // problem shape
            key.transA, key.transB,         // transpose flags
            key.epilogue,                   // epilogue selector
            key.io_dtype, key.out_dtype);   // input / output data types
    }
};


// Workspace ceiling (32 MiB) handed to cuBLASLt's heuristic search; it limits
// which algorithms are considered. The VRAM actually allocated only grows to
// the largest workspace the chosen algorithms reported they need
// (see ensure_cublas_lt_workspace).
constexpr size_t cublas_lt_workspace_search_bytes()
{
    return static_cast<size_t>(32ull << 20);
}


// Grows the global cublasLt scratch buffer to at least `min_bytes` and returns
// a pointer to it. The buffer starts at size 0 and only grows when a plan is
// created whose chosen algorithm needs more workspace.
//
// `min_bytes` defaults to 0.
// NOTE(review): presumably a call with the default simply returns the current
// buffer without growing it — confirm against the definition, which is not in
// this header.
void* ensure_cublas_lt_workspace(size_t min_bytes = 0);


// Scratch-buffer accessors. Only the declarations are visible in this header.
// NOTE(review): judging by the names, each presumably returns a reusable
// buffer with room for at least `n_elements` values of the pointed-to type —
// confirm sizing, lifetime and ownership against the definitions.

// Returns a __nv_bfloat16 pointer; per its name, scratch for inputs.
__nv_bfloat16* ensure_bf16_input_scratch(Index n_elements);


// Returns a __nv_bfloat16 pointer; per its name, scratch for gradients.
__nv_bfloat16* ensure_bf16_gradient_scratch(Index n_elements);


// Returns a float pointer; per its name, scratch for values upcast to FP32.
float* ensure_fp32_upcast_scratch(Index n_elements);


// Returns a float pointer; per its name, scratch used by loss computations.
float* get_loss_scratch(Index n_elements);


// Takes a tensor view and a target element type and returns an untyped,
// read-only pointer. Only the declaration is visible here.
// NOTE(review): presumably returns the view's own data when it already has
// `target_type` and a converted scratch copy otherwise — confirm against the
// definition, including how long the returned pointer stays valid.
const void* maybe_cast(const TensorView& input, Type target_type);


// Returns a read-only reference to the LtMatmulPlan for the given problem.
// The parameters correspond one-to-one to the fields of LtMatmulPlanKey:
// shape (m, n, k), transpose operations for A and B, epilogue, and the
// input/output data types, both of which default to CUDA_R_32F.
// NOTE(review): presumably plans are built on first use and cached under an
// LtMatmulPlanKey — confirm against the definition, which is not in this
// header, along with the lifetime of the returned reference.
const LtMatmulPlan& get_lt_gemm_plan(

    int m, int n, int k,

    cublasOperation_t transA,

    cublasOperation_t transB,

    cublasLtEpilogue_t epilogue,

    cudaDataType_t io_dtype  = CUDA_R_32F,

    cudaDataType_t out_dtype = CUDA_R_32F);


// Issues one GEMM through cublasGemmEx on the handle returned by
// Backend::get_cublas_handle().
//
// Operand pointers, data types and leading dimensions are forwarded unchanged.
// `alpha` and `beta` are passed by address as float scalars (defaults 1 and 0),
// and the algorithm is always CUBLAS_GEMM_DEFAULT. The cuBLAS status is routed
// through CHECK_CUBLAS.
inline void gemm_cuda(cublasOperation_t transa, cublasOperation_t transb,
                      int m, int n, int k,
                      const void* A, cudaDataType_t Atype, int lda,
                      const void* B, cudaDataType_t Btype, int ldb,
                      void* C, cudaDataType_t Ctype, int ldc,
                      float alpha = 1.0f, float beta = 0.0f)
{
    // CUBLAS_COMPUTE_32F_FAST_TF32 is FP32-input only, so as soon as either
    // input operand is BF16 the plain CUBLAS_COMPUTE_32F mode is selected;
    // otherwise the project-wide CUBLAS_COMPUTE_DTYPE is used.
    const bool has_bf16_operand = (Atype == CUDA_R_16BF) || (Btype == CUDA_R_16BF);

    cublasComputeType_t compute = CUBLAS_COMPUTE_DTYPE;

    if (has_bf16_operand)
        compute = CUBLAS_COMPUTE_32F;

    CHECK_CUBLAS(cublasGemmEx(Backend::get_cublas_handle(),
                              transa, transb, m, n, k,
                              &alpha,
                              A, Atype, lda,
                              B, Btype, ldb,
                              &beta,
                              C, Ctype, ldc,
                              compute, CUBLAS_GEMM_DEFAULT));
}


// Issues `batch_count` GEMMs in one call through cublasGemmStridedBatchedEx on
// the handle returned by Backend::get_cublas_handle().
//
// A, B and C all use the same data type `io_dtype` (default CUDA_R_32F); each
// has its own leading dimension and stride, which are forwarded unchanged.
// `alpha` and `beta` are passed by address as float scalars (defaults 1 and 0),
// and the algorithm is always CUBLAS_GEMM_DEFAULT. The cuBLAS status is routed
// through CHECK_CUBLAS.
inline void gemm_strided_batched_cuda(cublasOperation_t transa, cublasOperation_t transb,
                                      int m, int n, int k,
                                      const void* A, int lda, long long stride_a,
                                      const void* B, int ldb, long long stride_b,
                                      void* C, int ldc, long long stride_c,
                                      int batch_count,
                                      cudaDataType_t io_dtype = CUDA_R_32F,
                                      float alpha = 1.0f, float beta = 0.0f)
{
    // BF16 data selects plain CUBLAS_COMPUTE_32F; every other type uses the
    // project-wide CUBLAS_COMPUTE_DTYPE.
    const bool is_bf16 = (io_dtype == CUDA_R_16BF);

    cublasComputeType_t compute = CUBLAS_COMPUTE_DTYPE;

    if (is_bf16)
        compute = CUBLAS_COMPUTE_32F;

    CHECK_CUBLAS(cublasGemmStridedBatchedEx(Backend::get_cublas_handle(),
                                            transa, transb, m, n, k,
                                            &alpha,
                                            A, io_dtype, lda, stride_a,
                                            B, io_dtype, ldb, stride_b,
                                            &beta,
                                            C, io_dtype, ldc, stride_c,
                                            batch_count,
                                            compute, CUBLAS_GEMM_DEFAULT));
}


#endif // OPENNN_HAS_CUDA


}


// OpenNN: Open Neural Networks Library.

// Copyright(C) 2005-2026 Artificial Intelligence Techniques, SL.

// Licensed under the GNU Lesser General Public License v2.1 or later.

opennn::Backend::get_cublas_handle
static cublasHandle_t get_cublas_handle()
Definition tensor_utilities.h:403

opennn::opennn::hash_combine
size_t hash_combine(const Vs &... values)
Definition neural_network.h:388

opennn
Definition adaptive_moment_estimation.h:19

opennn::cublasLtEpilogue_t
cublasLtEpilogue_t
Definition neural_network.h:85

opennn::CUBLAS_COMPUTE_DTYPE
constexpr cublasComputeType_t CUBLAS_COMPUTE_DTYPE
Definition tensor_utilities.h:43

opennn::Type
Type
Definition configuration.h:18

opennn::cudaDataType_t
cudaDataType_t
Definition neural_network.h:82

opennn::CUDA_R_16BF
@ CUDA_R_16BF
Definition neural_network.h:82

opennn::CUDA_R_32F
@ CUDA_R_32F
Definition neural_network.h:82

opennn::cublasComputeType_t
cublasComputeType_t
Definition neural_network.h:83

opennn::CUBLAS_COMPUTE_32F
@ CUBLAS_COMPUTE_32F
Definition neural_network.h:83

opennn::cublasOperation_t
cublasOperation_t
Definition neural_network.h:84

opennn::TensorView
Definition tensor_utilities.h:236

opennn::__nv_bfloat16
Definition neural_network.h:78

tensor_utilities.h