OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
opennn::AttentionOp Struct Reference

Scaled dot-product attention with optional causal mask and dropout. More...

#include <operators.h>

Inheritance diagram for opennn::AttentionOp:
[legend]

Public Member Functions

void set (Index heads_number, Index head_dimension, Index query_sequence_length, Index source_sequence_length, bool use_causal_mask, Type compute_dtype)
 Configures the attention geometry and compute precision.
 
void set_dropout_rate (float rate)
 Sets the post-softmax dropout rate (0 disables dropout).
 
vector< TensorSpec > forward_scratch_specs (Index batch_size) const
 Returns the tensor specs of the forward-pass scratch buffers used by the operator.
 
void forward_propagate (ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
 Runs the operator's forward computation.
 
void back_propagate (ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
 Runs the operator's backward computation, accumulating into gradient/delta buffers.
 
void apply (const TensorView &query, const TensorView &key, const TensorView &value, const TensorView &source_input, TensorView &attention_weights, TensorView &attention_weights_dropped, TensorView &output, float *mask_scratch, bool is_training)
 Computes attention output from Q, K, V; applies softmax, mask and dropout in place.
 
void apply_delta (const TensorView &query, const TensorView &key, const TensorView &value, const TensorView &attention_output, const TensorView &attention_weights, const TensorView &attention_weights_dropped, const TensorView &output_delta, TensorView &attention_weight_delta, TensorView &query_delta, TensorView &key_delta, TensorView &value_delta) const
 Computes Q/K/V gradients from the output gradient and cached forward activations.
 
void to_JSON (JsonWriter &w) const override
 Serializes the operator configuration to a JSON writer.
 
void from_JSON (const Json *parent) override
 Restores the operator configuration from a JSON node.
 
void destroy_cuda () override
 Releases CUDA resources owned by the operator; called from destructors.
 
 AttentionOp ()
 
 ~AttentionOp () override
 
 AttentionOp (AttentionOp &&) noexcept
 
AttentionOp & operator= (AttentionOp &&) noexcept
 
 AttentionOp (const AttentionOp &)=delete
 
AttentionOp & operator= (const AttentionOp &)=delete
 
- Public Member Functions inherited from opennn::Operator
virtual ~Operator ()=default
 
virtual vector< TensorSpec > parameter_specs () const
 Returns the tensor specs of trainable parameters owned by this operator.
 
virtual vector< TensorSpec > state_specs () const
 Returns the tensor specs of persistent state owned by this operator.
 
virtual void link_parameters (span< const TensorView >)
 Binds parameter views provided by the hosting layer.
 
virtual void link_gradients (span< const TensorView >)
 Binds gradient views provided by the hosting layer.
 
virtual void link_states (span< const TensorView >)
 Binds state views provided by the hosting layer.
 
virtual void set_parameters_random ()
 Initializes parameters with random values.
 
virtual void set_parameters_glorot ()
 Initializes parameters using Glorot (Xavier) initialization.
 
virtual void load_state_from_JSON (const Json *)
 Restores persistent state (e.g. running statistics) from a JSON node.
 
TensorView & get_input (ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept
 
vector< TensorView > & get_inputs (ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept
 
TensorView & get_output (ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept
 
TensorView & get_output_delta (BackPropagation &bp, size_t layer, size_t i=0) const noexcept
 
TensorView & get_input_delta (BackPropagation &bp, size_t layer, size_t i=0) const noexcept
 

Public Attributes

Index heads_number = 0
 
Index head_dimension = 0
 
Index query_sequence_length = 0
 
Index source_sequence_length = 0
 
bool use_causal_mask = false
 
Type compute_dtype = Type::FP32
 
MatrixR causal_mask
 
DropoutOp dropout
 
size_t source_view_index = 1
 
vector< size_t > scratch_slots
 
vector< size_t > attention_output_slots
 
- Public Attributes inherited from opennn::Operator
vector< size_t > input_slots = {0}
 
vector< size_t > output_slots = {1}
 
vector< size_t > input_delta_slots = {1}
 
vector< size_t > output_delta_slots = {0}
 

Detailed Description

Scaled dot-product attention with optional causal mask and dropout.

Constructor & Destructor Documentation

◆ AttentionOp() [1/3]

opennn::AttentionOp::AttentionOp ( )

◆ ~AttentionOp()

opennn::AttentionOp::~AttentionOp ( )
override

◆ AttentionOp() [2/3]

opennn::AttentionOp::AttentionOp ( AttentionOp && )
noexcept

◆ AttentionOp() [3/3]

opennn::AttentionOp::AttentionOp ( const AttentionOp & )
delete

Member Function Documentation

◆ apply()

void opennn::AttentionOp::apply ( const TensorView & query,
const TensorView & key,
const TensorView & value,
const TensorView & source_input,
TensorView & attention_weights,
TensorView & attention_weights_dropped,
TensorView & output,
float * mask_scratch,
bool is_training )

Computes attention output from Q, K, V; applies softmax, mask and dropout in place.

Parameters
query: Query tensor (batch, heads, Q_seq, head_dim).
key: Key tensor (batch, heads, S_seq, head_dim).
value: Value tensor (batch, heads, S_seq, head_dim).
source_input: Source-side input used to build the padding mask.
attention_weights: Output attention scores (CPU only; empty on GPU).
attention_weights_dropped: Optional masked-out scores (CPU only).
output: Output tensor (batch, heads, Q_seq, head_dim).
mask_scratch: Scratch buffer for the temporary mask.
is_training: Enables dropout when true.

◆ apply_delta()

void opennn::AttentionOp::apply_delta ( const TensorView & query,
const TensorView & key,
const TensorView & value,
const TensorView & attention_output,
const TensorView & attention_weights,
const TensorView & attention_weights_dropped,
const TensorView & output_delta,
TensorView & attention_weight_delta,
TensorView & query_delta,
TensorView & key_delta,
TensorView & value_delta ) const

Computes Q/K/V gradients from the output gradient and cached forward activations.

Parameters
query: Forward Q tensor.
key: Forward K tensor.
value: Forward V tensor.
attention_output: Forward output tensor (read by GPU SDPA only).
attention_weights: Forward attention scores (CPU only).
attention_weights_dropped: Forward masked-out scores (CPU only).
output_delta: Gradient w.r.t. the attention output.
attention_weight_delta: Scratch tensor (CPU only).
query_delta: Output gradient w.r.t. Q.
key_delta: Output gradient w.r.t. K.
value_delta: Output gradient w.r.t. V.

◆ back_propagate()

void opennn::AttentionOp::back_propagate ( ForwardPropagation & fp,
BackPropagation & bp,
size_t layer ) const
override virtual noexcept

Runs the operator's backward computation, accumulating into gradient/delta buffers.

Parameters
fp: Forward propagation workspace (read-only).
bp: Back propagation workspace receiving gradients and deltas.
layer: Index of the hosting layer in the workspace.

Reimplemented from opennn::Operator.

◆ destroy_cuda()

void opennn::AttentionOp::destroy_cuda ( )
override virtual

Releases CUDA resources owned by the operator; called from destructors.

Reimplemented from opennn::Operator.

◆ forward_propagate()

void opennn::AttentionOp::forward_propagate ( ForwardPropagation & fp,
size_t layer,
bool is_training )
override virtual noexcept

Runs the operator's forward computation.

Parameters
fp: Forward propagation workspace.
layer: Index of the hosting layer in the workspace.
is_training: If true, enables training-only behavior (e.g. dropout sampling).

Reimplemented from opennn::Operator.

◆ forward_scratch_specs()

vector< TensorSpec > opennn::AttentionOp::forward_scratch_specs ( Index batch_size) const

Returns the tensor specs of the forward-pass scratch buffers used by the operator.

◆ from_JSON()

void opennn::AttentionOp::from_JSON ( const Json * )
override virtual

Restores the operator configuration from a JSON node.

Reimplemented from opennn::Operator.

◆ operator=() [1/2]

AttentionOp & opennn::AttentionOp::operator= ( AttentionOp && )
noexcept

◆ operator=() [2/2]

AttentionOp & opennn::AttentionOp::operator= ( const AttentionOp & )
delete

◆ set()

void opennn::AttentionOp::set ( Index heads_number,
Index head_dimension,
Index query_sequence_length,
Index source_sequence_length,
bool use_causal_mask,
Type compute_dtype )

Configures the attention geometry and compute precision.

Parameters
heads_number: Number of attention heads.
head_dimension: Per-head feature size.
query_sequence_length: Length of the query sequence.
source_sequence_length: Length of the key/value sequence.
use_causal_mask: If true, applies an upper-triangular causal mask.
compute_dtype: Dtype used for the QK and softmax-V matmuls.

◆ set_dropout_rate()

void opennn::AttentionOp::set_dropout_rate ( float rate)
inline

Sets the post-softmax dropout rate (0 disables dropout).

◆ to_JSON()

void opennn::AttentionOp::to_JSON ( JsonWriter & ) const
override virtual

Serializes the operator configuration to a JSON writer.

Reimplemented from opennn::Operator.

Member Data Documentation

◆ attention_output_slots

vector<size_t> opennn::AttentionOp::attention_output_slots

◆ causal_mask

MatrixR opennn::AttentionOp::causal_mask

◆ compute_dtype

Type opennn::AttentionOp::compute_dtype = Type::FP32

◆ dropout

DropoutOp opennn::AttentionOp::dropout

◆ head_dimension

Index opennn::AttentionOp::head_dimension = 0

◆ heads_number

Index opennn::AttentionOp::heads_number = 0

◆ query_sequence_length

Index opennn::AttentionOp::query_sequence_length = 0

◆ scratch_slots

vector<size_t> opennn::AttentionOp::scratch_slots

◆ source_sequence_length

Index opennn::AttentionOp::source_sequence_length = 0

◆ source_view_index

size_t opennn::AttentionOp::source_view_index = 1

◆ use_causal_mask

bool opennn::AttentionOp::use_causal_mask = false