documentation/reference/tensor__utilities_8h_source.html

//   OpenNN: Open Neural Networks Library

//   www.opennn.net

//

//   T E N S O R   U T I L I T I E S   C L A S S   H E A D E R

//

//   Artificial Intelligence Techniques SL

//   artelnics@artelnics.com


#pragma once


#include "pch.h"

#include "configuration.h"


namespace opennn

{


// Alignment boundary in bytes, taken directly from Eigen's EIGEN_MAX_ALIGN_BYTES.
static constexpr Index ALIGN_BYTES = EIGEN_MAX_ALIGN_BYTES;

// The same boundary counted in float elements (ALIGN_BYTES / sizeof(float)).
static constexpr Index ALIGN_ELEMENTS = ALIGN_BYTES / sizeof(float);


/// Converts an Index to int with a plain static_cast (no range check is done).
[[nodiscard]] inline int to_int(Index value)
{
    const int narrowed = static_cast<int>(value);
    return narrowed;
}

/// Converts an Index to float with a plain static_cast.
[[nodiscard]] inline float to_type(Index value)
{
    const float converted = static_cast<float>(value);
    return converted;
}


/// Rounds `value` up to the next multiple of `alignment`.
///
/// The rounding uses a bit mask, so the result is only a true multiple when
/// `alignment` is a power of two. An alignment of 0 or 1 imposes no constraint
/// and returns `value` unchanged; with the mask formula alone an alignment of 0
/// (which Eigen allows for EIGEN_MAX_ALIGN_BYTES) would collapse every value to 0.
[[nodiscard]] inline Index align_up(Index value, Index alignment)
{
    if (alignment <= 1) return value;

    // For value == 0 the expression below already yields 0, so no special case is needed.
    return (value + alignment - 1) & ~(alignment - 1);
}


/// Rounds an element count up to a multiple of ALIGN_ELEMENTS.
[[nodiscard]] inline Index get_aligned_size(Index size)
{
    return align_up(size, ALIGN_ELEMENTS);
}

/// Rounds a byte count up to a multiple of ALIGN_BYTES.
[[nodiscard]] inline Index get_aligned_bytes(Index n_bytes)
{
    return align_up(n_bytes, ALIGN_BYTES);
}

/// Aligned byte footprint of `count` elements of type `dtype`:
/// count * type_bytes(dtype), rounded up to a multiple of ALIGN_BYTES.
[[nodiscard]] inline Index get_aligned_bytes(Index count, Type dtype)
{
    const Index raw_bytes = count * type_bytes(dtype);
    return get_aligned_bytes(raw_bytes);
}


/// Reports whether the address held by `ptr` is a whole multiple of ALIGN_BYTES.
[[nodiscard]] inline bool is_aligned(const void* ptr)
{
    const auto address = reinterpret_cast<uintptr_t>(ptr);
    const auto remainder = address % ALIGN_BYTES;
    return remainder == 0;
}


// CUDA data-type tag fixed to CUDA_R_32F. NOTE(review): the name suggests it is
// the element type used for reductions — confirm against the call sites.
constexpr cudaDataType_t      CUDA_REDUCTION_DTYPE   = CUDA_R_32F;

// cuBLAS compute type fixed to CUBLAS_COMPUTE_32F_FAST_TF32.
constexpr cublasComputeType_t CUBLAS_COMPUTE_DTYPE   = CUBLAS_COMPUTE_32F_FAST_TF32;


/// Fixed-capacity list of tensor dimensions. Up to MaxRank extents are stored
/// inline in `dims`; only the first `rank` of them are meaningful.
struct Shape
{
    static constexpr size_t MaxRank = 4;

    Index  dims[MaxRank] = {0};   // extents, valid in [0, rank)
    size_t rank          = 0;     // number of dimensions currently held

    Shape() noexcept = default;

    /// Builds a shape with `new_rank` dimensions, every one equal to `value`.
    /// Throws runtime_error when `new_rank` is larger than MaxRank.
    Shape(size_t new_rank, Index value) : rank(new_rank)
    {
        if (new_rank > MaxRank)
            throw runtime_error(format("Shape: rank {} exceeds MaxRank={}.",
                                       new_rank, MaxRank));

        for (size_t axis = 0; axis < rank; ++axis)
            dims[axis] = value;
    }

    /// Builds a shape from a brace-enclosed list of dimensions.
    /// Throws runtime_error when the list holds more than MaxRank entries.
    Shape(initializer_list<Index> list) : rank(list.size())
    {
        if (list.size() > MaxRank)
            throw runtime_error(format("Shape: initializer rank {} exceeds MaxRank={}.",
                                       list.size(), MaxRank));

        size_t axis = 0;
        for (const Index extent : list)
            dims[axis++] = extent;
    }

    // Read-only iteration over the `rank` stored extents.
    [[nodiscard]] const Index* begin() const noexcept { return &dims[0]; }
    [[nodiscard]] const Index* end()   const noexcept { return begin() + rank; }

    // Unchecked element access: `i` is not validated against `rank`.
    [[nodiscard]] const Index& operator[](size_t i) const noexcept { return dims[i]; }
    Index&                     operator[](size_t i)       noexcept { return dims[i]; }

    /// Last stored dimension; throws runtime_error on a rank-0 shape.
    Index& back()
    {
        if (rank == 0)
            throw runtime_error("Shape::back() on empty");

        return dims[rank - 1];
    }

    /// Last stored dimension (read-only); throws runtime_error on a rank-0 shape.
    [[nodiscard]] const Index& back() const
    {
        if (rank == 0)
            throw runtime_error("Shape::back() on empty");

        return dims[rank - 1];
    }

    [[nodiscard]] bool empty() const noexcept { return rank == 0; }

    /// Returns dims[i] when `i` is a stored axis, otherwise 0.
    [[nodiscard]] Index dim_or_zero(size_t i) const noexcept
    {
        if (i >= rank) return Index(0);

        return dims[i];
    }

    /// Number of elements: the product of all stored dimensions, or 0 for rank 0.
    [[nodiscard]] Index size() const noexcept
    {
        if (rank == 0) return 0;

        Index product = 1;
        for (const Index extent : *this)
            product = product * extent;

        return product;
    }

    /// Resets to rank 0; the values left in `dims` are not touched.
    void clear() noexcept { rank = 0; }

    /// Appends one dimension; when the shape already holds MaxRank entries
    /// the value is dropped without any error.
    void push_back(Index value) noexcept
    {
        if (rank >= MaxRank) return;

        dims[rank] = value;
        ++rank;
    }

    /// Streams the shape as "[ d0, d1, ... ]" ("[ ]" when rank is 0).
    friend ostream& operator<<(ostream& os, const Shape& shape)
    {
        os << "[";

        const char* separator = " ";
        for (const Index extent : shape)
        {
            os << separator << extent;
            separator = ", ";
        }

        os << " ]";
        return os;
    }

    /// Two shapes are equal when they have the same rank and the same stored extents.
    [[nodiscard]] bool operator==(const Shape& other) const noexcept
    {
        if (rank != other.rank) return false;

        for (size_t axis = 0; axis < rank; ++axis)
            if (!(dims[axis] == other.dims[axis])) return false;

        return true;
    }

    /// Appends the dimensions of `other`, keeping only as many as still fit
    /// below MaxRank. Returns *this so calls can be chained.
    Shape& append(const Shape& other)
    {
        const size_t room = MaxRank - rank;
        const size_t copy_count = other.rank < room ? other.rank : room;

        for (size_t k = 0; k < copy_count; ++k)
            dims[rank + k] = other.dims[k];

        rank += copy_count;
        return *this;
    }
};


// Describes a tensor by its shape and element type only; it carries no data pointer.
struct TensorSpec

{

    Shape shape;               // dimensions of the described tensor

    Type  dtype = Type::FP32;  // element type; FP32 unless set otherwise

};


/// Sum of the aligned element counts of all specs: each spec's shape.size()
/// is rounded up on its own before being added.
[[nodiscard]] inline Index get_aligned_size(const vector<TensorSpec>& specs)
{
    Index total = 0;

    for (const TensorSpec& spec : specs)
        total = total + get_aligned_size(spec.shape.size());

    return total;
}


/// Sum of the aligned element counts over every inner list of specs.
[[nodiscard]] inline Index get_aligned_size(const vector<vector<TensorSpec>>& specs)
{
    Index total = 0;

    for (const vector<TensorSpec>& group : specs)
        total = total + get_aligned_size(group);

    return total;
}


/// Sum of the aligned byte footprints of all specs, each measured with the
/// spec's own dtype and rounded up individually.
[[nodiscard]] inline Index get_aligned_bytes(const vector<TensorSpec>& specs)
{
    Index total = 0;

    for (const TensorSpec& spec : specs)
        total = total + get_aligned_bytes(spec.shape.size(), spec.dtype);

    return total;
}


/// Sum of the aligned byte footprints over every inner list of specs,
/// using each spec's own dtype.
[[nodiscard]] inline Index get_aligned_bytes(const vector<vector<TensorSpec>>& specs)
{
    Index total = 0;

    for (const vector<TensorSpec>& group : specs)
        total = total + get_aligned_bytes(group);

    return total;
}


/// Sum of the aligned byte footprints of all shapes, every one measured
/// with the single element type `dtype`.
[[nodiscard]] inline Index get_aligned_bytes(const vector<Shape>& shapes, Type dtype)
{
    Index total = 0;

    for (const Shape& shape : shapes)
        total = total + get_aligned_bytes(shape.size(), dtype);

    return total;
}


/// Sum of the aligned byte footprints of all specs, measured with `dtype`
/// for every entry; the dtype stored in each spec is not consulted.
[[nodiscard]] inline Index get_aligned_bytes(const vector<TensorSpec>& specs, Type dtype)
{
    Index total = 0;

    for (const TensorSpec& spec : specs)
        total = total + get_aligned_bytes(spec.shape.size(), dtype);

    return total;
}


/// Sum of the aligned byte footprints over every inner list of specs,
/// measured with `dtype` for every entry.
[[nodiscard]] inline Index get_aligned_bytes(const vector<vector<TensorSpec>>& specs, Type dtype)
{
    Index total = 0;

    for (const vector<TensorSpec>& group : specs)
        total = total + get_aligned_bytes(group, dtype);

    return total;
}


struct Buffer

{

    void* data = nullptr;

    Index bytes = 0;

    Device device_type = Device::CPU;


    template<typename T> [[nodiscard]] T*       as()       { return static_cast<T*>(data); }

    template<typename T> [[nodiscard]] const T* as() const { return static_cast<const T*>(data); }


    [[nodiscard]] Index size_in_floats() const { return bytes / Index(sizeof(float)); }

    [[nodiscard]] bool  empty() const { return bytes == 0; }


    void resize_bytes(Index new_bytes, Device new_device_type)

    {

        if (new_bytes == bytes && device_type == new_device_type) return;

        free_buffer();

        if (new_bytes == 0) return;


        data = alloc(new_device_type, new_bytes);

        device_type = new_device_type;

        bytes = new_bytes;

    }


    void grow_to(Index new_bytes)

    {

        if (new_bytes > bytes)

            resize_bytes(new_bytes, device_type);

    }


    template<typename T>


    T* ensure(Index n_elements)

    {

        grow_to(n_elements * Index(sizeof(T)));

        return as<T>();

    }


    void setZero()

    {

        if (!data) return;

#ifdef OPENNN_HAS_CUDA

        if (device_type == Device::CUDA)

        {

            CHECK_CUDA(cudaMemset(data, 0, bytes));

            return;

        }

#endif

        memset(data, 0, static_cast<size_t>(bytes));

    }


#ifdef OPENNN_HAS_CUDA

    void migrate_to(Device target, cudaStream_t stream = nullptr)

    {

        if (device_type == target || !data) return;


        void* fresh = alloc(target, bytes);

        const cudaMemcpyKind kind = (target == Device::CUDA)

            ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToHost;


        if (stream)

        {

            CHECK_CUDA(cudaMemcpyAsync(fresh, data, bytes, kind, stream));

            CHECK_CUDA(cudaStreamSynchronize(stream));

        }

        else

        {

            CHECK_CUDA(cudaMemcpy(fresh, data, bytes, kind));

        }


        dealloc(device_type, data, bytes);

        data = fresh;

        device_type = target;

    }

#endif


    explicit Buffer(Device new_device_type = Device::CPU) noexcept : device_type(new_device_type) {}

    Buffer(const Buffer&) = delete;

    Buffer& operator=(const Buffer&) = delete;


    Buffer(Buffer&& other) noexcept : Buffer() { swap(other); }

    Buffer& operator=(Buffer&& other) noexcept { swap(other); return *this; }


    ~Buffer() { free_buffer(); }


    void swap(Buffer& other) noexcept

    {

        std::swap(data, other.data);

        std::swap(bytes, other.bytes);

        std::swap(device_type, other.device_type);

    }


private:

    static void* alloc([[maybe_unused]] Device device_type, Index byte_count)

    {

#ifdef OPENNN_HAS_CUDA

        if (device_type == Device::CUDA) { void* device_pointer = nullptr; CHECK_CUDA(cudaMalloc(&device_pointer, byte_count)); return device_pointer; }

#endif

        return Eigen::aligned_allocator<uint8_t>{}.allocate(static_cast<size_t>(byte_count));

    }


    static void dealloc([[maybe_unused]] Device device_type, void* pointer, Index byte_count)

    {

#ifdef OPENNN_HAS_CUDA

        if (device_type == Device::CUDA) { cudaFree(pointer); return; }

#endif

        Eigen::aligned_allocator<uint8_t>{}.deallocate(static_cast<uint8_t*>(pointer), static_cast<size_t>(byte_count));

    }


    void free_buffer()

    {

        if (data) dealloc(device_type, data, bytes);

        data = nullptr;

        bytes = 0;

    }

};


/// Non-owning view over tensor memory: a raw pointer plus the Shape and element
/// Type needed to interpret it. The as_* helpers wrap the same memory in Eigen
/// maps of float; they never copy or allocate.
///
/// Extents of the returned maps are computed by multiplying dimensions, never by
/// dividing shape.size(), so a dimension equal to 0 yields an empty map instead
/// of an integer division by zero.
struct TensorView
{
    void* data = nullptr;      // borrowed pointer to the first element

    Shape shape;               // extents of the viewed tensor

    Type type = Type::FP32;    // element type of the viewed memory

    /// Wraps an external buffer with the given shape and element type.
    TensorView(void* new_data = nullptr, const Shape& new_shape = {},
               Type new_dtype = Type::FP32) noexcept
        : data(new_data), shape(new_shape), type(new_dtype) {}

    /// Number of dimensions.
    [[nodiscard]] Index get_rank() const noexcept { return shape.rank; }

    /// Total element count (shape.size()).
    [[nodiscard]] Index size() const noexcept { return shape.size(); }

    /// Total byte count: size() * type_bytes(type).
    [[nodiscard]] Index byte_size() const noexcept { return size() * type_bytes(type); }

    /// True when the shape has rank 0.
    [[nodiscard]] bool empty() const noexcept { return shape.empty(); }

    /// Product of the extents on axes [first_axis, last_axis); 1 for an empty range.
    /// `last_axis` must not exceed shape.rank.
    [[nodiscard]] Index extent_product(size_t first_axis, size_t last_axis) const noexcept
    {
        Index product = 1;
        for (size_t axis = first_axis; axis < last_axis; ++axis)
            product *= shape[axis];
        return product;
    }

    /// Reinterprets the data pointer as T* (no type checking; asserts non-null).
    template<typename T>
    [[nodiscard]] T* as() const noexcept
    {
        assert(data);
        return reinterpret_cast<T*>(data);
    }

    /// Reinterprets the data pointer as float*; unlike as<T>() a null pointer is allowed.
    [[nodiscard]] float* as_float() const noexcept
    {
        return reinterpret_cast<float*>(data);
    }

    /// CUDA data-type tag for this view's element type.
    [[nodiscard]] cudaDataType_t cuda_dtype() const noexcept { return to_cuda(type); }

    /// Calls `fn` with a value-initialized object of the concrete element type
    /// selected by `type` (FP32 or BF16), so `fn` can be a generic lambda.
    template<typename F>
    void dispatch(F&& fn) const
    {
        visit_type<Type::FP32, Type::BF16>(type, [&](auto info)
        {
            fn(typename decltype(info)::type{});
        });
    }

    /// New view over the same memory and element type with a different shape.
    /// The element count of `new_shape` is not checked against the current one.
    [[nodiscard]] TensorView reshape(const Shape& new_shape) const
    { return TensorView(data, new_shape, type); }

    /// Matrix map with rows = first dimension and cols = product of the remaining ones.
    [[nodiscard]] MatrixMap as_matrix() const
    {
        assert(shape.rank >= 2);
        const Index rows = shape[0];
        const Index cols = extent_product(1, shape.rank);
        return MatrixMap(as<float>(), rows, cols);
    }

    /// Matrix map of one slice: the last two dimensions are rows and cols, and
    /// `batch_index` selects the slice by an offset of batch_index * rows * cols.
    [[nodiscard]] MatrixMap as_matrix(Index batch_index) const
    {
        assert(shape.rank >= 2);
        const Index rows = shape[shape.rank - 2];
        const Index cols = shape[shape.rank - 1];
        return MatrixMap(as<float>() + batch_index * rows * cols, rows, cols);
    }

    /// Matrix map with cols = last dimension and rows = product of all leading ones.
    [[nodiscard]] MatrixMap as_flat_matrix() const
    {
        assert(shape.rank >= 1);
        const Index cols = shape[shape.rank - 1];
        const Index rows = extent_product(0, shape.rank - 1);
        return MatrixMap(as<float>(), rows, cols);
    }

    /// Flat matrix map of one slice along the first dimension: cols = last
    /// dimension, rows = product of the dimensions strictly between first and last.
    [[nodiscard]] MatrixMap as_flat_matrix(Index batch_index) const
    {
        assert(shape.rank >= 2);
        const Index cols = shape[shape.rank - 1];
        const Index rows = extent_product(1, shape.rank - 1);
        return MatrixMap(as<float>() + batch_index * rows * cols, rows, cols);
    }

    /// Flat vector map over all size() elements.
    [[nodiscard]] VectorMap as_vector() const
    {
        return VectorMap(as<float>(), shape.size());
    }

    /// Tensor map of rank `Rank` over the whole view; the view's rank must equal `Rank`.
    template<int Rank>
    [[nodiscard]] TensorMapR<Rank> as_tensor() const
    {
        assert(shape.rank == Rank);
        Eigen::array<Index, Rank> dims;
        copy_n(shape.dims, Rank, dims.begin());
        return TensorMapR<Rank>(as<float>(), dims);
    }

    /// Tensor map of rank `Rank` over one slice along the first dimension;
    /// the view's rank must equal `Rank + 1`.
    template<int Rank>
    [[nodiscard]] TensorMapR<Rank> as_tensor(Index batch_index) const
    {
        assert(shape.rank == Rank + 1);
        Eigen::array<Index, Rank> dims;
        for (int i = 0; i < Rank; ++i) dims[i] = shape[i + 1];
        const Index slice_size = extent_product(1, shape.rank);
        return TensorMapR<Rank>(as<float>() + batch_index * slice_size, dims);
    }

    /// Sets every element to `value` (defined out of line, after Backend).
    void fill(float value);

    /// Zeros every element; shorthand for fill(0.0f).
    void setZero() { fill(0.0f); }

#ifdef OPENNN_HAS_CUDA
    /// Zeros the viewed bytes asynchronously (defined out of line).
    void set_zero_async() const;

    // cuDNN descriptor created on first use by get_descriptor(). Copies of the
    // view share the same descriptor object through the shared_ptr.
    mutable shared_ptr<cudnnTensorStruct> descriptor_handle = nullptr;

    /// Returns the cuDNN tensor descriptor for this view, creating it on first
    /// use. The result is null when the shape is empty or has a non-positive
    /// extent. An existing descriptor is returned as is; it is not refreshed
    /// if `shape` or `type` were changed after it was created.
    cudnnTensorDescriptor_t get_descriptor() const
    {
        if (!descriptor_handle && !shape.empty())
            set_descriptor(shape);

        return descriptor_handle.get();
    }

private:
    // Creates the descriptor if needed and configures it as a 4-d NHWC tensor.
    void set_descriptor(const Shape& described_shape) const
    {
        // NHWC layout: for rank < 4 the missing dimensions default to 1.
        // The last axis is always channels and, from rank 2 on, the first is the batch.
        int batch_count = 1, channels = 1, height = 1, width = 1;

        const size_t rank = described_shape.rank;

        if (rank >= 1) channels    = static_cast<int>(described_shape[rank - 1]);
        if (rank >= 2) batch_count = static_cast<int>(described_shape[0]);
        if (rank >= 3) width       = static_cast<int>(described_shape[rank - 2]);
        if (rank >= 4) height      = static_cast<int>(described_shape[rank - 3]);

        // Skip descriptor setup for non-positive extents; the handle stays as it was.
        if (batch_count <= 0 || channels <= 0 || height <= 0 || width <= 0)
            return;

        if (!descriptor_handle)
        {
            cudnnTensorDescriptor_t raw_desc;
            CHECK_CUDNN(cudnnCreateTensorDescriptor(&raw_desc));

            // The deleter ties the descriptor's lifetime to the last view sharing it.
            descriptor_handle = shared_ptr<cudnnTensorStruct>(raw_desc, [](cudnnTensorDescriptor_t descriptor) {
                cudnnDestroyTensorDescriptor(descriptor);
            });
        }

        CHECK_CUDNN(cudnnSetTensor4dDescriptor(descriptor_handle.get(), CUDNN_TENSOR_NHWC, to_cudnn(type), batch_count, channels, height, width));
    }

#endif

};


/// Returns views[slots[i]] when `i` is a valid index into `slots`, otherwise
/// `fallback`. The slot value itself is not range-checked against `views`.
inline TensorView& view_at_slot_or(vector<TensorView>& views,
                                   const vector<size_t>& slots, size_t i,
                                   TensorView& fallback)
{
    if (i >= slots.size())
        return fallback;

    const size_t slot = slots[i];
    return views[slot];
}


/// Returns the first view of views[slots[i]] when `i` is a valid index into
/// `slots`, otherwise `fallback`. Neither the slot value nor the inner list's
/// size is range-checked.
inline TensorView& view_at_slot_or(vector<vector<TensorView>>& views,
                                   const vector<size_t>& slots, size_t i,
                                   TensorView& fallback)
{
    if (i >= slots.size())
        return fallback;

    const size_t slot = slots[i];
    return views[slot][0];
}


// Alias so code in namespace opennn can write array<T, N> for Eigen::array<T, N>.
template<typename T, size_t N>

using array = Eigen::array<T, N>;


// Serializes a shape as a separator-joined string of dimensions (defined elsewhere).
// NOTE(review): the second argument is presumably the separator; it defaults to " ".
[[nodiscard]] string shape_to_string(const Shape&, const string& = " ");

// Parses a separator-joined string of dimensions into a Shape (defined elsewhere).
// NOTE(review): the second argument is presumably the separator; it defaults to " ".
[[nodiscard]] Shape string_to_shape(const string&, const string& = " ");


// Boost-style hash combine. Mixes one or more values into a single size_t. Used

// for plan/graph cache keys (cuBLASLt, cuDNN SDPA).

template<typename... Vs>


[[nodiscard]] size_t hash_combine(const Vs&... values)

{

    size_t h = 0;

    ((h ^= hash<Vs>{}(values) + 0x9e3779b9 + (h << 6) + (h >> 2)), ...);

    return h;

}


// Process-wide singleton that owns the thread pool and the cuBLAS/cuDNN handles.
// Construction and destruction are private, so the only way to reach the object
// is through instance().
class Backend

{

public:


    // Returns the global Backend instance.
    static Backend& instance();


    // Returns the Eigen ThreadPoolDevice used for CPU tensor evaluations.
    ThreadPoolDevice* get_thread_pool_device();


    // Reconfigures the underlying thread pool to use num_threads workers.
    void set_threads_number(int num_threads);


    // Static accessors: each one forwards to the matching member of instance().
    // Shared cuBLAS handle for legacy GEMM calls.
    static cublasHandle_t get_cublas_handle()                      { return instance().cublas_handle; }

    // Shared cuBLASLt handle for batched/tuned GEMMs.
    static cublasLtHandle_t get_cublas_lt_handle()                 { return instance().cublas_lt_handle; }

    // Shared cuDNN handle.
    static cudnnHandle_t get_cudnn_handle()                        { return instance().cudnn_handle; }

    // Default CUDA stream used by the compute backend.
    static cudaStream_t get_compute_stream()                       { return instance().compute_stream; }

    // cuDNN op-tensor descriptor configured for elementwise sum.
    static cudnnOpTensorDescriptor_t get_operator_sum_descriptor() { return instance().operator_sum_descriptor; }


private:

    // Defined out of line; private so that only instance() can create the object.
    Backend();

    ~Backend();


    // Declared before thread_pool_device, so the pool is constructed first and
    // destroyed last of the two.
    unique_ptr<ThreadPool> thread_pool;

    unique_ptr<ThreadPoolDevice> thread_pool_device;


    // Library handles; all start out null. NOTE(review): presumably created in
    // the constructor and released in the destructor — both are defined elsewhere.
    cublasHandle_t cublas_handle = nullptr;

    cublasLtHandle_t cublas_lt_handle = nullptr;

    cudnnHandle_t cudnn_handle = nullptr;

    cudaStream_t compute_stream = nullptr;

    cudnnOpTensorDescriptor_t operator_sum_descriptor = nullptr;

};


/// Convenience accessor for the global Eigen ThreadPoolDevice: dereferences the
/// pointer returned by Backend::instance().get_thread_pool_device().
inline ThreadPoolDevice& get_device()
{
    Backend& backend = Backend::instance();
    ThreadPoolDevice* const device = backend.get_thread_pool_device();
    return *device;
}


/// Sets every element of the view to `value`.
///
/// A view with no data or no elements is left untouched. Host memory is filled
/// directly: FP32 as floats, BF16 as 16-bit patterns. With OPENNN_HAS_CUDA,
/// device memory is filled through cudaMemset (value 0) or cudnnSetTensor.
///
/// BF16 is handled explicitly because a BF16 element is 2 bytes: writing 4-byte
/// floats into such a buffer would run past its end.
inline void TensorView::fill(float value)
{
    // Nothing to write. This also keeps a null descriptor (which get_descriptor()
    // returns for zero-sized shapes) away from cudnnSetTensor.
    if (!data || size() == 0) return;

    // Converts a float to the bit pattern of the nearest BF16 value (the upper
    // 16 bits of the float, rounded to nearest with ties to even).
    const auto to_bf16_bits = [](float source) noexcept -> uint16_t
    {
        uint32_t bits = 0;
        memcpy(&bits, &source, sizeof(bits));

        // NaN: truncate and force a mantissa bit so rounding cannot turn it into infinity.
        if ((bits & 0x7fffffffu) > 0x7f800000u)
            return static_cast<uint16_t>((bits >> 16) | 0x0040u);

        bits += 0x7fffu + ((bits >> 16) & 1u);
        return static_cast<uint16_t>(bits >> 16);
    };

#ifdef OPENNN_HAS_CUDA
    // Probe the pointer: set_parameters_random() may call fill() on host-resident
    // TensorViews even when Device is already GPU.
    cudaPointerAttributes attr{};
    const cudaError_t err = cudaPointerGetAttributes(&attr, data);
    const bool gpu_data = (err == cudaSuccess) && (attr.type == cudaMemoryTypeDevice);
    if (err != cudaSuccess) cudaGetLastError();   // clear sticky error from CPU pointer probe

    if (gpu_data)
    {
        if (value == 0.0f)
        {
            // All-zero bytes represent 0 for every element type, so a memset is enough.
            CHECK_CUDA(cudaMemset(data, 0, byte_size()));
            return;
        }

        if (type == Type::BF16)
        {
            // Per the cuDNN API reference, cudnnSetTensor reads the fill value
            // in the tensor's own element type, so hand it a BF16 value.
            const uint16_t bf16_value = to_bf16_bits(value);
            CHECK_CUDNN(cudnnSetTensor(Backend::get_cudnn_handle(),
                                       get_descriptor(), data, &bf16_value));
            return;
        }

        CHECK_CUDNN(cudnnSetTensor(Backend::get_cudnn_handle(),
                                   get_descriptor(), data, &value));
        return;
    }
#endif

    if (type == Type::BF16)
    {
        // Host BF16 storage: write 2-byte patterns, not 4-byte floats.
        uint16_t* bf16_pointer = static_cast<uint16_t*>(data);
        std::fill(bf16_pointer, bf16_pointer + size(), to_bf16_bits(value));
        return;
    }

    assert(type == Type::FP32);

    float* data_pointer = static_cast<float*>(data);
    std::fill(data_pointer, data_pointer + size(), value);
}


#ifdef OPENNN_HAS_CUDA


inline void TensorView::set_zero_async() const

{

    if (!data || byte_size() == 0) return;

    CHECK_CUDA(cudaMemsetAsync(data, 0, byte_size(), Backend::get_compute_stream()));

}


// Float constants 1 and 0 with addressable storage.
// NOTE(review): presumably passed by pointer as alpha/beta scaling factors to
// cuDNN/cuBLAS calls — confirm against the call sites.
inline const float one = 1.0f;

inline const float zero = 0.0f;


// Synchronous device-to-host copy of a contiguous device buffer into an FP32
// host buffer, upcasting BF16 -> FP32 inline. Blocks on `stream`.
//
//   device_src     device buffer to read
//   src_dtype      element type of the device buffer; throws on unsupported dtype
//   element_count  number of elements to copy
//   host_dst       FP32 host destination
//   stream         CUDA stream the copy is issued on and waited for
//
// Allocates a uint16_t staging buffer for BF16 sources, so callers on hot paths
// may prefer an inline version with a pre-allocated staging buffer.

void copy_device_to_host_float(const void* device_src, Type src_dtype,

                               Index element_count, float* host_dst,

                               cudaStream_t stream);


#endif


}


// OpenNN: Open Neural Networks Library.

// Copyright(C) 2005-2026 Artificial Intelligence Techniques, SL.

// Licensed under the GNU Lesser General Public License v2.1 or later.

opennn::Backend
Process-wide singleton that owns the thread pool and the cuBLAS/cuDNN handles.
Definition tensor_utilities.h:491

opennn::Backend::get_compute_stream
static cudaStream_t get_compute_stream()
Default CUDA stream used by the compute backend.
Definition tensor_utilities.h:510

opennn::Backend::instance
static Backend & instance()
Returns the global Backend instance.

opennn::Backend::get_cudnn_handle
static cudnnHandle_t get_cudnn_handle()
Shared cuDNN handle.
Definition tensor_utilities.h:508

opennn::Backend::get_cublas_handle
static cublasHandle_t get_cublas_handle()
Shared cuBLAS handle for legacy GEMM calls.
Definition tensor_utilities.h:504

opennn::Backend::get_cublas_lt_handle
static cublasLtHandle_t get_cublas_lt_handle()
Shared cuBLASLt handle for batched/tuned GEMMs.
Definition tensor_utilities.h:506

opennn::Backend::get_operator_sum_descriptor
static cudnnOpTensorDescriptor_t get_operator_sum_descriptor()
cuDNN op-tensor descriptor configured for elementwise sum.
Definition tensor_utilities.h:512

opennn::Backend::get_thread_pool_device
ThreadPoolDevice * get_thread_pool_device()
Returns the Eigen ThreadPoolDevice used for CPU tensor evaluations.

opennn::Backend::set_threads_number
void set_threads_number(int num_threads)
Reconfigures the underlying thread pool to use num_threads workers.

configuration.h

opennn
Definition adaptive_moment_estimation.h:14

opennn::is_aligned
bool is_aligned(const void *ptr)
Definition tensor_utilities.h:32

opennn::hash_combine
size_t hash_combine(const Vs &... values)
Boost-style hash combine that mixes any number of hashable values into a single size_t.
Definition tensor_utilities.h:482

opennn::CUDA_REDUCTION_DTYPE
constexpr cudaDataType_t CUDA_REDUCTION_DTYPE
Definition tensor_utilities.h:37

opennn::to_type
float to_type(Index value)
Definition tensor_utilities.h:21

opennn::type_bytes
Index type_bytes(Type type) noexcept
Returns the byte size of one element of the given OpenNN Type.
Definition configuration.h:111

opennn::CUBLAS_COMPUTE_DTYPE
constexpr cublasComputeType_t CUBLAS_COMPUTE_DTYPE
Definition tensor_utilities.h:38

opennn::Device
Device
Execution device selection for OpenNN runtime (auto-detected, CPU or CUDA GPU).
Definition configuration.h:17

opennn::Device::CPU
@ CPU
Definition configuration.h:17

opennn::Device::CUDA
@ CUDA
Definition configuration.h:17

opennn::get_aligned_bytes
Index get_aligned_bytes(Index n_bytes)
Definition tensor_utilities.h:29

opennn::array
Eigen::array< T, N > array
Definition tensor_utilities.h:471

opennn::string_to_shape
Shape string_to_shape(const string &, const string &=" ")
Parses a separator-joined string of dimensions into a Shape.

opennn::get_device
ThreadPoolDevice & get_device()
Convenience accessor for the global Eigen ThreadPoolDevice.
Definition tensor_utilities.h:529

opennn::to_cudnn
cudnnDataType_t to_cudnn(Type type) noexcept
Returns the cuDNN data type matching the given OpenNN Type (Auto resolves to FP32).
Definition configuration.h:83

opennn::get_aligned_size
Index get_aligned_size(Index size)
Definition tensor_utilities.h:28

opennn::Type
Type
Numeric precision used for training or inference tensors.
Definition configuration.h:20

opennn::Type::FP32
@ FP32
Definition configuration.h:20

opennn::to_int
int to_int(Index value)
Definition tensor_utilities.h:20

opennn::visit_type
void visit_type(Type t, F &&f)
Dispatches f with the TypeInfo of the runtime Type t (must be in Supported).
Definition configuration.h:59

opennn::shape_to_string
string shape_to_string(const Shape &, const string &=" ")
Serializes a shape as a separator-joined string of dimensions.

opennn::to_cuda
cudaDataType_t to_cuda(Type type) noexcept
Returns the CUDA data type matching the given OpenNN Type (Auto resolves to FP32).
Definition configuration.h:97

opennn::ALIGN_BYTES
static constexpr Index ALIGN_BYTES
Definition tensor_utilities.h:17

opennn::align_up
Index align_up(Index value, Index alignment)
Definition tensor_utilities.h:23

opennn::ALIGN_ELEMENTS
static constexpr Index ALIGN_ELEMENTS
Definition tensor_utilities.h:18

opennn::view_at_slot_or
TensorView & view_at_slot_or(vector< TensorView > &views, const vector< size_t > &slots, size_t i, TensorView &fallback)
Definition tensor_utilities.h:456

pch.h

cudnnTensorDescriptor_t
cudnnTensorStruct * cudnnTensorDescriptor_t
Definition pch.h:109

VectorMap
Map< VectorR, AlignedMax > VectorMap
Definition pch.h:185

TensorMapR
TensorMap< Tensor< float, Rank, Layout|AlignedMax >, AlignedMax > TensorMapR
Definition pch.h:201

cudnnHandle_t
void * cudnnHandle_t
Definition pch.h:86

cudaStream_t
void * cudaStream_t
Definition pch.h:82

cudaDataType_t
cudaDataType_t
Definition pch.h:93

CUDA_R_32F
@ CUDA_R_32F
Definition pch.h:93

cublasLtHandle_t
void * cublasLtHandle_t
Definition pch.h:85

cudnnOpTensorDescriptor_t
void * cudnnOpTensorDescriptor_t
Definition pch.h:115

cublasHandle_t
void * cublasHandle_t
Definition pch.h:84

EIGEN_MAX_ALIGN_BYTES
#define EIGEN_MAX_ALIGN_BYTES
Definition pch.h:12

MatrixMap
Map< MatrixR, Layout|AlignedMax > MatrixMap
Definition pch.h:186

cublasComputeType_t
cublasComputeType_t
Definition pch.h:94

CUBLAS_COMPUTE_32F_FAST_TF32
@ CUBLAS_COMPUTE_32F_FAST_TF32
Definition pch.h:94

opennn::Buffer::operator=
Buffer & operator=(Buffer &&other) noexcept
Definition tensor_utilities.h:254

opennn::Buffer::empty
bool empty() const
Returns true if no storage is allocated.
Definition tensor_utilities.h:179

opennn::Buffer::size_in_floats
Index size_in_floats() const
Capacity expressed in float elements.
Definition tensor_utilities.h:177

opennn::Buffer::swap
void swap(Buffer &other) noexcept
Swaps storage with another buffer.
Definition tensor_utilities.h:259

opennn::Buffer::ensure
T * ensure(Index n_elements)
Ensures the buffer holds at least n_elements of T and returns a typed pointer.
Definition tensor_utilities.h:202

opennn::Buffer::device_type
Device device_type
Definition tensor_utilities.h:169

opennn::Buffer::grow_to
void grow_to(Index new_bytes)
Grows the buffer to at least new_bytes; no-op if already large enough.
Definition tensor_utilities.h:194

opennn::Buffer::Buffer
Buffer(Buffer &&other) noexcept
Definition tensor_utilities.h:253

opennn::Buffer::setZero
void setZero()
Zeros all bytes in the buffer (cudaMemset on device, memset on host).
Definition tensor_utilities.h:209

opennn::Buffer::bytes
Index bytes
Definition tensor_utilities.h:168

opennn::Buffer::data
void * data
Definition tensor_utilities.h:167

opennn::Buffer::Buffer
Buffer(Device new_device_type=Device::CPU) noexcept
Constructs an empty buffer targeting the given device type.
Definition tensor_utilities.h:249

opennn::Buffer::operator=
Buffer & operator=(const Buffer &)=delete

opennn::Buffer::~Buffer
~Buffer()
Definition tensor_utilities.h:256

opennn::Buffer::Buffer
Buffer(const Buffer &)=delete

opennn::Buffer::resize_bytes
void resize_bytes(Index new_bytes, Device new_device_type)
Resizes the buffer to new_bytes on new_device_type, freeing prior storage.
Definition tensor_utilities.h:182

opennn::Buffer::as
T * as()
Reinterprets the buffer as a typed pointer (no bounds checking).
Definition tensor_utilities.h:172

opennn::Buffer::as
const T * as() const
Reinterprets the buffer as a typed const pointer.
Definition tensor_utilities.h:174

opennn::Shape
Fixed-capacity small-vector describing tensor dimensions (rank up to MaxRank).
Definition tensor_utilities.h:42

opennn::Shape::operator[]
Index & operator[](size_t i) noexcept
Definition tensor_utilities.h:71

opennn::Shape::begin
const Index * begin() const noexcept
Definition tensor_utilities.h:68

opennn::Shape::Shape
Shape() noexcept=default

opennn::Shape::operator<<
friend ostream & operator<<(ostream &os, const Shape &shape)
Definition tensor_utilities.h:92

opennn::Shape::dim_or_zero
Index dim_or_zero(size_t i) const noexcept
Returns dims[i] when i is in range, otherwise 0.
Definition tensor_utilities.h:79

opennn::Shape::MaxRank
static constexpr size_t MaxRank
Definition tensor_utilities.h:43

opennn::Shape::back
const Index & back() const
Definition tensor_utilities.h:74

opennn::Shape::push_back
void push_back(Index value) noexcept
Appends a dimension to the shape (silently no-op if already at MaxRank).
Definition tensor_utilities.h:90

opennn::Shape::empty
bool empty() const noexcept
Definition tensor_utilities.h:76

opennn::Shape::rank
size_t rank
Definition tensor_utilities.h:46

opennn::Shape::Shape
Shape(initializer_list< Index > list)
Builds a shape from a brace-enclosed list of dimensions.
Definition tensor_utilities.h:60

opennn::Shape::operator[]
const Index & operator[](size_t i) const noexcept
Definition tensor_utilities.h:70

opennn::Shape::dims
Index dims[MaxRank]
Definition tensor_utilities.h:45

opennn::Shape::end
const Index * end() const noexcept
Definition tensor_utilities.h:69

opennn::Shape::operator==
bool operator==(const Shape &other) const noexcept
Definition tensor_utilities.h:100

opennn::Shape::clear
void clear() noexcept
Resets the shape to rank 0 without freeing storage.
Definition tensor_utilities.h:88

opennn::Shape::append
Shape & append(const Shape &other)
Appends another shape's dimensions to this one, stopping at MaxRank.
Definition tensor_utilities.h:106

opennn::Shape::size
Index size() const noexcept
Returns the number of elements (product of all dimensions).
Definition tensor_utilities.h:82

opennn::Shape::back
Index & back()
Definition tensor_utilities.h:73

opennn::TensorSpec
Lightweight description of a tensor's shape and data type (no storage attached).
Definition tensor_utilities.h:117

opennn::TensorSpec::shape
Shape shape
Definition tensor_utilities.h:118

opennn::TensorSpec::dtype
Type dtype
Definition tensor_utilities.h:119

opennn::TensorView
Non-owning view over a tensor: pointer, shape, and data type with rich reshape helpers.
Definition tensor_utilities.h:293

opennn::TensorView::as_matrix
MatrixMap as_matrix() const
Maps the view to an Eigen matrix: rows = first dim, cols = product of the rest.
Definition tensor_utilities.h:349

opennn::TensorView::empty
bool empty() const noexcept
Returns true if the shape is empty.
Definition tensor_utilities.h:315

opennn::TensorView::setZero
void setZero()
Zeros every element of the view.
Definition tensor_utilities.h:411

opennn::TensorView::size
Index size() const noexcept
Total element count.
Definition tensor_utilities.h:309

opennn::TensorView::byte_size
Index byte_size() const noexcept
Total byte count (size() * sizeof(dtype)).
Definition tensor_utilities.h:312

opennn::TensorView::as_flat_matrix
MatrixMap as_flat_matrix() const
Maps the view to an Eigen matrix flattened across all leading dimensions.
Definition tensor_utilities.h:365

opennn::TensorView::fill
void fill(float value)
Sets every element of the view to the given value, dispatching CPU/GPU as needed.
Definition tensor_utilities.h:534

opennn::TensorView::as_tensor
TensorMapR< Rank > as_tensor(Index batch_index) const
Maps a single batch slice of the view to an Eigen Tensor of rank Rank.
Definition tensor_utilities.h:399

opennn::TensorView::get_rank
Index get_rank() const noexcept
Number of dimensions in the view.
Definition tensor_utilities.h:306

opennn::TensorView::as_vector
VectorMap as_vector() const
Maps the view to a flat Eigen vector.
Definition tensor_utilities.h:382

opennn::TensorView::cuda_dtype
cudaDataType_t cuda_dtype() const noexcept
Returns the CUDA data type tag corresponding to this view's dtype.
Definition tensor_utilities.h:332

opennn::TensorView::as_float
float * as_float() const noexcept
Reinterprets the view's data as a float pointer.
Definition tensor_utilities.h:326

opennn::TensorView::reshape
TensorView reshape(const Shape &new_shape) const
Returns a new view over the same memory with a different shape.
Definition tensor_utilities.h:345

opennn::TensorView::shape
Shape shape
Definition tensor_utilities.h:296

opennn::TensorView::as
T * as() const noexcept
Reinterprets the view's data as a pointer to T (no type checking).
Definition tensor_utilities.h:319

opennn::TensorView::TensorView
TensorView(void *new_data=nullptr, const Shape &new_shape={}, Type new_dtype=Type::FP32) noexcept
Constructs a view from an external buffer, shape, and dtype.
Definition tensor_utilities.h:301

opennn::TensorView::data
void * data
Definition tensor_utilities.h:294

opennn::TensorView::as_tensor
TensorMapR< Rank > as_tensor() const
Maps the view to an Eigen Tensor of the given rank.
Definition tensor_utilities.h:389

opennn::TensorView::dispatch
void dispatch(F &&fn) const
Dispatches a callable on the concrete element type (FP32 or BF16).
Definition tensor_utilities.h:336

opennn::TensorView::as_matrix
MatrixMap as_matrix(Index batch_index) const
Maps a single batch slice of the view to an Eigen matrix.
Definition tensor_utilities.h:356

opennn::TensorView::as_flat_matrix
MatrixMap as_flat_matrix(Index batch_index) const
Flat-matrix view of a single batch slice.
Definition tensor_utilities.h:373

opennn::TensorView::type
Type type
Definition tensor_utilities.h:298