117 void check(
const vector<TensorView>& inputs,
const TensorView& output)
const;
161 void apply_delta_cpu(
TensorView& delta) const;
162 void apply_delta_gpu(
TensorView& delta) const;
164 void ensure_mask(Index n);
247 void set(Index new_input_features, Index new_output_features,
Type new_weight_type =
Type::FP32);
272 bool accumulate_input_delta =
false)
const;
279 TensorView& input_delta,
bool accumulate_input_delta)
const;
281 TensorView& input_delta,
bool accumulate_input_delta)
const;
326 void set(Index new_features,
float new_momentum = 0.1f);
371 bool inference_cache_dirty =
true;
373 mutable VectorR delta_scale_scratch;
378 void apply_training_cpu (
const TensorView& input,
381 void apply_training_gpu (
const TensorView& input,
417#ifdef OPENNN_HAS_CUDA
425 size_t cudnn_workspace_size_ = 0;
431 Index planned_batch_size = 0;
446 void set(Index input_h, Index input_w,
447 Index kernels_n, Index kernel_h, Index kernel_w, Index kernel_c,
448 Index row_stride, Index column_stride,
449 Index padding_h, Index padding_w,
502 void set(Index input_h, Index input_w,
503 Index kernels_n, Index kernel_h, Index kernel_w, Index kernel_c,
504 Index row_stride, Index column_stride,
505 Index padding_h, Index padding_w,
567 void apply_delta_cpu(
const TensorView& output_delta,
637 float* scratch)
const;
746 float scaling_factor() const;
793 void apply_delta_gpu_unfused(const
TensorView& query,
804 static
bool get_contiguous_source_lengths(const
TensorView& source_input,
805 vector<Index>& lengths,
807 static
void softmax_rows_prefix(
float* matrix, Index rows, Index cols, Index length);
808 static Index infer_attention_prefix_length(const
TensorView& attention_weights,
813 template<typename SoftmaxBwd>
814 void apply_delta_unfused(const
TensorView& query,
824 SoftmaxBwd&& softmax_bwd) const;
826 mutable unique_ptr<SDPACache> sdpa_cache;
831 uint64_t sdpa_dropout_seed = 0x9E3779B97F4A7C15ULL;
832 uint64_t sdpa_dropout_offset = 0;
833 mutable uint64_t sdpa_last_used_offset = 0;
877#ifdef OPENNN_HAS_CUDA
892 void set(Index input_h, Index input_w, Index input_c,
893 Index pool_h, Index pool_w,
895 Index padding_h, Index padding_w,
919 void apply_delta_cpu(
const TensorView& output_delta,
966 void set(Index new_vocabulary_size, Index new_sequence_length, Index new_embedding_dimension);
Definition adaptive_moment_estimation.h:14
float mean(const VectorR &)
Arithmetic mean of a vector, ignoring NaNs.
Eigen::array< T, N > array
Definition tensor_utilities.h:471
VectorI maximal_indices(const VectorR &, Index)
Indices of the n largest elements of a vector.
Type
Numeric precision used for training or inference tensors.
Definition configuration.h:20
@ FP32
Definition configuration.h:20
Matrix< float, Dynamic, 1 > VectorR
Definition pch.h:181
cudnnConvolutionFwdAlgo_t
Definition pch.h:103
@ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
Definition pch.h:103
void * cudnnFilterDescriptor_t
Definition pch.h:110
cudnnActivationMode_t
Definition pch.h:100
cublasLtEpilogue_t
Definition pch.h:96
@ CUBLASLT_EPILOGUE_BIAS
Definition pch.h:96
void * cudnnPoolingDescriptor_t
Definition pch.h:112
cudnnConvolutionBwdDataAlgo_t
Definition pch.h:104
@ CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
Definition pch.h:104
Matrix< float, Dynamic, Dynamic, Layout > MatrixR
Definition pch.h:177
void * cudnnActivationDescriptor_t
Definition pch.h:113
void * cudnnConvolutionDescriptor_t
Definition pch.h:111
cudnnConvolutionBwdFilterAlgo_t
Definition pch.h:105
@ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
Definition pch.h:105
Element-wise non-linear activation (Identity, Sigmoid, Tanh, ReLU, Softmax).
Definition operators.h:169
void apply_delta(const TensorView &outputs, TensorView &delta) const
Multiplies delta by the derivative of the activation evaluated at outputs.
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void to_JSON(JsonWriter &w) const override
Serializes the operator configuration to a JSON writer.
static const string & to_string(Function function)
Returns the string name of a Function.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
vector< size_t > output_slots_backward
Definition operators.h:192
void apply_gpu(TensorView &output)
GPU forward implementation; applies the activation in place on output.
Function
Supported activation functions.
Definition operators.h:171
@ Sigmoid
Definition operators.h:171
@ Softmax
Definition operators.h:171
@ Identity
Definition operators.h:171
@ Tanh
Definition operators.h:171
@ ReLU
Definition operators.h:171
void set_function(Function new_function)
Selects the activation function and configures cuDNN descriptors.
cudnnActivationDescriptor_t descriptor
Definition operators.h:187
ActivationOp(const ActivationOp &)=delete
Function function
Definition operators.h:185
static const EnumMap< Function > & map()
Returns the bidirectional mapping between Function values and their string names.
void set_function(const string &name)
Selects the activation function by name (delegates to set_function(Function)).
ActivationOp & operator=(const ActivationOp &)=delete
static cudnnActivationMode_t to_cudnn_mode(Function function)
Returns the cuDNN activation mode corresponding to a Function.
static Function from_string(const string &name)
Returns the Function corresponding to a string name.
void apply_cpu(TensorView &output)
CPU forward implementation; applies the activation in place on output.
~ActivationOp() override
Definition operators.h:219
void from_JSON(const Json *parent) override
Restores the operator configuration from a JSON node.
Element-wise sum of several input tensors (used by residual connections).
Definition operators.h:110
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
vector< TensorSpec > forward_scratch_specs(Index batch_size) const
Returns the tensor specs of the forward-pass scratch buffers used by the operator.
Index source_sequence_length
Definition operators.h:646
Index query_sequence_length
Definition operators.h:645
bool use_causal_mask
Definition operators.h:647
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
void set(Index heads_number, Index head_dimension, Index query_sequence_length, Index source_sequence_length, bool use_causal_mask, Type compute_dtype)
Configures the attention geometry and compute precision.
MatrixR causal_mask
Definition operators.h:650
void set_dropout_rate(float rate)
Sets the post-softmax dropout rate (0 disables dropout).
Definition operators.h:666
DropoutOp dropout
Definition operators.h:652
void apply(const TensorView &query, const TensorView &key, const TensorView &value, const TensorView &source_input, TensorView &attention_weights, TensorView &attention_weights_dropped, TensorView &output, float *mask_scratch, bool is_training)
Computes attention output from Q, K, V; applies softmax, mask and dropout in place.
void apply_delta(const TensorView &query, const TensorView &key, const TensorView &value, const TensorView &attention_output, const TensorView &attention_weights, const TensorView &attention_weights_dropped, const TensorView &output_delta, TensorView &attention_weight_delta, TensorView &query_delta, TensorView &key_delta, TensorView &value_delta) const
Computes Q/K/V gradients from the output gradient and cached forward activations.
vector< size_t > attention_output_slots
Definition operators.h:680
void from_JSON(const Json *parent) override
Restores the operator configuration from a JSON node.
Index head_dimension
Definition operators.h:644
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Type compute_dtype
Definition operators.h:648
AttentionOp(AttentionOp &&) noexcept
void to_JSON(JsonWriter &w) const override
Serializes the operator configuration to a JSON writer.
Index heads_number
Definition operators.h:643
vector< size_t > scratch_slots
Definition operators.h:679
size_t source_view_index
Definition operators.h:677
Workspace holding parameter gradients and per-layer deltas during a backward pass.
Definition back_propagation.h:21
vector< vector< TensorView > > delta_views
Definition back_propagation.h:44
Batch normalization with learnable scale/shift and running statistics for inference.
Definition operators.h:308
void load_state_from_JSON(const Json *parent) override
Restores persistent state (e.g. running statistics) from a JSON node.
Index features
Definition operators.h:309
void set(Index new_features, float new_momentum=0.1f)
Configures the per-feature normalization.
void link_states(span< const TensorView > views) override
Binds state views provided by the hosting layer.
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
TensorView gamma
Definition operators.h:312
void init_defaults()
Resets gamma to one, beta to zero, and running stats to identity values.
void apply_delta(const TensorView &input, const TensorView &mean, const TensorView &inverse_variance, TensorView &delta) const
Computes the input gradient given the cached normalization statistics from forward.
void invalidate_inference_cache()
Marks the inference cache as stale so it is rebuilt on the next inference call.
Definition operators.h:362
vector< TensorSpec > state_specs() const override
Returns the tensor specs of persistent state owned by this operator.
float momentum
Definition operators.h:310
void from_JSON(const Json *parent) override
Restores the operator configuration from a JSON node.
TensorView beta_gradient
Definition operators.h:318
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void update_inference_cache()
Rebuilds the fused scale/shift cache used by the inference path from running stats.
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:335
void set_parameters_random() override
Initializes parameters with random values.
Definition operators.h:334
TensorView beta
Definition operators.h:313
TensorView gamma_gradient
Definition operators.h:317
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
TensorView running_mean
Definition operators.h:314
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
bool active() const
Returns true when the operator has been configured (features > 0).
Definition operators.h:321
void to_JSON(JsonWriter &w) const override
Serializes the operator configuration to a JSON writer.
TensorView running_variance
Definition operators.h:315
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Clamps each output channel to a configurable lower/upper interval.
Definition operators.h:1004
TensorView upper
Definition operators.h:1011
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Method
Disables bounding or enables per-channel clamping.
Definition operators.h:1006
@ Bounding
Definition operators.h:1006
@ NoBounding
Definition operators.h:1006
Method method
Definition operators.h:1008
TensorView lower
Definition operators.h:1010
Owning raw byte buffer that lives on CPU or CUDA memory, with aligned (re)allocation.
Definition tensor_utilities.h:166
Affine combination output = input * weights + bias (the dense matmul building block).
Definition operators.h:232
TensorView bias_gradient
Definition operators.h:241
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
TensorView weights
Definition operators.h:237
void set_parameters_random() override
Initializes parameters with random values.
Index input_features
Definition operators.h:233
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Type weight_type
Definition operators.h:235
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void apply(const TensorView &input, TensorView &output, cublasLtEpilogue_t epilogue=CUBLASLT_EPILOGUE_BIAS)
Computes output = input * weights + bias with an optional fused epilogue (ReLU, bias, ...).
TensorView weight_gradient
Definition operators.h:240
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
void apply_delta(const TensorView &output_delta, const TensorView &input, TensorView &input_delta, bool accumulate_input_delta=false) const
Computes input_delta from output_delta and updates weight/bias gradients.
TensorView bias
Definition operators.h:238
Index output_features
Definition operators.h:234
void set(Index new_input_features, Index new_output_features, Type new_weight_type=Type::FP32)
Configures input/output dimensions and the weight storage dtype.
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
Fused affine + ReLU activation (uses cuBLASLt epilogue on GPU when available).
Definition operators.h:286
void set(Index input_features, Index output_features, Type weight_type=Type::FP32)
Configures the underlying CombinationOp; ReLU is fixed.
ActivationOp activation
Definition operators.h:288
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:298
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
Definition operators.h:294
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
Definition operators.h:293
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void set_parameters_random() override
Initializes parameters with random values.
Definition operators.h:297
CombinationOp combination
Definition operators.h:287
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
Definition operators.h:295
2D convolution operator (NHWC layout) backed by Eigen on CPU and cuDNN on GPU.
Definition operators.h:397
TensorView bias
Definition operators.h:412
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
Index kernel_channels
Definition operators.h:404
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
TensorView weight_gradient
Definition operators.h:414
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
void set(Index input_h, Index input_w, Index kernels_n, Index kernel_h, Index kernel_w, Index kernel_c, Index row_stride, Index column_stride, Index padding_h, Index padding_w, Type compute_dtype)
Configures the convolution geometry and compute precision.
ConvolutionOp & operator=(const ConvolutionOp &)=delete
TensorView bias_gradient
Definition operators.h:415
Index input_width
Definition operators.h:399
~ConvolutionOp() override
Definition operators.h:461
Index padding_height
Definition operators.h:406
Index padding_width
Definition operators.h:407
ConvolutionOp(const ConvolutionOp &)=delete
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
Index kernel_width
Definition operators.h:403
Type compute_dtype
Definition operators.h:409
Index kernel_height
Definition operators.h:402
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
void set_parameters_random() override
Initializes parameters with random values.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Index kernels_number
Definition operators.h:401
Index input_height
Definition operators.h:398
void apply_cpu(const TensorView &input, TensorView &output)
CPU forward path; im2col + GEMM.
void apply_delta(const TensorView &input, const TensorView &output_delta, TensorView &input_delta) const
Computes input_delta from output_delta and updates weight/bias gradients.
void apply_gpu(const TensorView &input, TensorView &output, cudnnActivationDescriptor_t fused_activation=nullptr)
GPU forward path; runs cuDNN convolution with an optional fused activation.
TensorView weights
Definition operators.h:411
ConvolutionReluOp()=default
void set(Index input_h, Index input_w, Index kernels_n, Index kernel_h, Index kernel_w, Index kernel_c, Index row_stride, Index column_stride, Index padding_h, Index padding_w, Type compute_dtype)
Configures the underlying ConvolutionOp; ReLU is fixed.
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
Definition operators.h:508
void set_parameters_random() override
Initializes parameters with random values.
Definition operators.h:512
~ConvolutionReluOp() override
Definition operators.h:516
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
Definition operators.h:510
ActivationOp activation
Definition operators.h:499
ConvolutionReluOp(const ConvolutionReluOp &)=delete
ConvolutionOp convolution
Definition operators.h:498
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:513
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
Definition operators.h:515
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
Definition operators.h:509
ConvolutionReluOp & operator=(const ConvolutionReluOp &)=delete
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Inverted dropout: at training time zeros activations with probability rate and rescales survivors.
Definition operators.h:122
bool active() const
Returns true when the dropout rate is non-zero.
Definition operators.h:130
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
vector< size_t > save_slots
Definition operators.h:127
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
DropoutOp(DropoutOp &&) noexcept=default
float rate
Definition operators.h:123
void apply_gpu(TensorView &output)
GPU forward implementation; samples the mask and rescales survivors in place.
void to_JSON(JsonWriter &w) const override
Serializes the operator configuration to a JSON writer.
~DropoutOp() override
Definition operators.h:154
Buffer mask
Definition operators.h:125
void set_rate(float new_rate)
Sets the drop probability (0 disables dropout).
void from_JSON(const Json *parent) override
Restores the operator configuration from a JSON node.
void apply_cpu(TensorView &output)
CPU forward implementation; samples the mask and rescales survivors in place.
void apply_delta(TensorView &delta) const
Applies the cached mask to a gradient tensor during the backward pass.
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
Token embedding lookup with optional scaling and additive positional encoding.
Definition operators.h:947
TensorView weights
Definition operators.h:957
bool add_positional_encoding
Definition operators.h:953
TensorView weight_gradient
Definition operators.h:960
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
void init_positional_encoding()
Fills the positional-encoding state tensor with the standard sinusoidal pattern.
bool scale_embedding
Definition operators.h:952
Index sequence_length
Definition operators.h:949
Index vocabulary_size
Definition operators.h:948
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
TensorView positional_encoding
Definition operators.h:958
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
float embedding_scale
Definition operators.h:955
vector< TensorSpec > state_specs() const override
Returns the tensor specs of persistent state owned by this operator.
void link_states(span< const TensorView > views) override
Binds state views provided by the hosting layer.
Index embedding_dimension
Definition operators.h:950
void set(Index new_vocabulary_size, Index new_sequence_length, Index new_embedding_dimension)
Configures the lookup table dimensions.
void set_parameters_random() override
Initializes parameters with random values.
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Flattens a multi-dimensional tensor into a 2D (batch, features) tensor.
Definition operators.h:995
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Workspace holding the activations of every layer during a forward pass.
Definition forward_propagation.h:20
vector< vector< vector< TensorView > > > views
Definition forward_propagation.h:45
Layer normalization with learnable scale/shift, applied across the embedding dimension.
Definition operators.h:530
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
TensorView gamma_gradient
Definition operators.h:537
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
TensorView gamma
Definition operators.h:534
void set(Index sequence_length, Index embedding_dimension)
Configures the operator for a (sequence_length, embedding_dimension) input.
Index sequence_length
Definition operators.h:531
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:548
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Index embedding_dimension
Definition operators.h:532
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
TensorView beta
Definition operators.h:535
TensorView beta_gradient
Definition operators.h:538
void init_defaults()
Resets gamma to one and beta to zero.
void set_parameters_random() override
Initializes parameters with random values.
Definition operators.h:547
Reshapes (batch, heads, seq, head_dim) tensors back into (batch, seq, embed); no parameters.
Definition operators.h:839
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Index head_dimension
Definition operators.h:842
Index query_sequence_length
Definition operators.h:841
Type compute_dtype
Definition operators.h:843
void set(Index heads_number, Index query_sequence_length, Index head_dimension, Type compute_dtype)
Configures the merge geometry.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Index heads_number
Definition operators.h:840
Projects (input_features) into (heads * head_dim) and reshapes for multi-head attention.
Definition operators.h:579
bool accumulate_input_delta_cross
Definition operators.h:600
Index head_dimension
Definition operators.h:583
CombinationOp combination
Definition operators.h:580
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Index input_features
Definition operators.h:581
vector< size_t > input_delta_slots_cross
Definition operators.h:598
vector< size_t > scratch_slots
Definition operators.h:592
vector< size_t > input_delta_slots_self
Definition operators.h:597
Type compute_dtype
Definition operators.h:584
bool accumulate_input_delta_self
Definition operators.h:599
void apply_delta(const TensorView &head_delta, const TensorView &input, TensorView &input_delta, bool accumulate, float *scratch) const
Computes input_delta from per-head gradients and updates the projection weight gradient.
void set_parameters_random() override
Initializes parameters with random values.
Definition operators.h:613
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void apply(const TensorView &input, TensorView &head_output, float *scratch)
Projects input and reshapes the result into per-head form in head_output.
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
Definition operators.h:610
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:614
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
Definition operators.h:609
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
Definition operators.h:611
Index heads_number
Definition operators.h:582
void set(Index input_features, Index heads_number, Index head_dimension, Type compute_dtype)
Configures the projection geometry.
size_t input_view_index
Definition operators.h:589
Base class for compute building blocks composed by layers (matmul, activation, dropout, ...).
Definition operators.h:28
virtual void destroy_cuda()
Releases CUDA resources owned by the operator; called from destructors.
Definition operators.h:74
virtual vector< TensorSpec > parameter_specs() const
Returns the tensor specs of trainable parameters owned by this operator.
Definition operators.h:32
TensorView & get_input(ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept
Definition operators.h:82
virtual void link_parameters(span< const TensorView >)
Binds parameter views provided by the hosting layer.
Definition operators.h:38
TensorView & get_output(ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept
Definition operators.h:92
virtual void back_propagate(ForwardPropagation &, BackPropagation &, size_t) const noexcept
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Definition operators.h:62
virtual void from_JSON(const Json *)
Restores the operator configuration from a JSON node.
Definition operators.h:68
virtual void to_JSON(JsonWriter &) const
Serializes the operator configuration to a JSON writer.
Definition operators.h:65
virtual void link_gradients(span< const TensorView >)
Binds gradient views provided by the hosting layer.
Definition operators.h:41
virtual void load_state_from_JSON(const Json *)
Restores persistent state (e.g. running statistics) from a JSON node.
Definition operators.h:71
TensorView & get_output_delta(BackPropagation &bp, size_t layer, size_t i=0) const noexcept
Definition operators.h:97
vector< TensorView > & get_inputs(ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept
Definition operators.h:87
vector< size_t > output_slots
Definition operators.h:77
TensorView & get_input_delta(BackPropagation &bp, size_t layer, size_t i=0) const noexcept
Definition operators.h:102
vector< size_t > output_delta_slots
Definition operators.h:80
vector< size_t > input_delta_slots
Definition operators.h:79
virtual void link_states(span< const TensorView >)
Binds state views provided by the hosting layer.
Definition operators.h:44
virtual void forward_propagate(ForwardPropagation &, size_t, bool) noexcept
Runs the operator's forward computation.
Definition operators.h:56
virtual void set_parameters_glorot()
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:50
virtual ~Operator()=default
vector< size_t > input_slots
Definition operators.h:76
virtual void set_parameters_random()
Initializes parameters with random values.
Definition operators.h:47
virtual vector< TensorSpec > state_specs() const
Returns the tensor specs of persistent state owned by this operator.
Definition operators.h:35
Sequence-wide 1D pooling over the embedding dimension (mean or max).
Definition operators.h:930
Method method
Definition operators.h:933
Method
Supported pooling reductions.
Definition operators.h:932
@ Max
Definition operators.h:932
@ Average
Definition operators.h:932
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void set(Index input_h, Index input_w, Index input_c, Index pool_h, Index pool_w, Index row_stride, Index column_stride, Index padding_h, Index padding_w, Method method)
Configures the pooling geometry.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
PoolOp(const PoolOp &)=delete
Method
Supported pooling reductions.
Definition operators.h:862
@ Average
Definition operators.h:862
@ Max
Definition operators.h:862
Index padding_height
Definition operators.h:872
Index input_channels
Definition operators.h:866
~PoolOp() override
Definition operators.h:900
Index row_stride
Definition operators.h:870
Index pool_height
Definition operators.h:868
Index column_stride
Definition operators.h:871
Index pool_width
Definition operators.h:869
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Method method
Definition operators.h:875
PoolOp & operator=(const PoolOp &)=delete
Index input_height
Definition operators.h:864
Index input_width
Definition operators.h:865
Index padding_width
Definition operators.h:873
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
Scales inputs to a target range using per-feature minimum/maximum or mean/std statistics.
Definition operators.h:1019
TensorView minimums
Definition operators.h:1023
float min_range
Definition operators.h:1020
TensorView means
Definition operators.h:1025
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
TensorView maximums
Definition operators.h:1024
float max_range
Definition operators.h:1021
TensorView scalers
Definition operators.h:1027
TensorView standard_deviations
Definition operators.h:1026
Non-owning view over a tensor: pointer, shape, and data type with rich reshape helpers.
Definition tensor_utilities.h:293
Inverse of ScaleOp: maps normalized outputs back to the original feature range.
Definition operators.h:1035
TensorView scalers
Definition operators.h:1043
float max_range
Definition operators.h:1037
TensorView means
Definition operators.h:1041
TensorView maximums
Definition operators.h:1040
TensorView minimums
Definition operators.h:1039
TensorView standard_deviations
Definition operators.h:1042
float min_range
Definition operators.h:1036
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.