Scaled dot-product attention with optional causal mask and dropout. More...

#include <operators.h>

Inheritance diagram for opennn::AttentionOp:

Public Member Functions
void	set (Index heads_number, Index head_dimension, Index query_sequence_length, Index source_sequence_length, bool use_causal_mask, Type compute_dtype)
	Configures the attention geometry and compute precision.

void	set_dropout_rate (float rate)
	Sets the post-softmax dropout rate (0 disables dropout).

vector< TensorSpec >	forward_scratch_specs (Index batch_size) const
	Returns the tensor specs of the forward-pass scratch buffers used by the operator.

void	forward_propagate (ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
	Runs the operator's forward computation.

void	back_propagate (ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
	Runs the operator's backward computation, accumulating into gradient/delta buffers.

void	apply (const TensorView &query, const TensorView &key, const TensorView &value, const TensorView &source_input, TensorView &attention_weights, TensorView &attention_weights_dropped, TensorView &output, float *mask_scratch, bool is_training)
	Computes attention output from Q, K, V; applies softmax, mask and dropout in place.

void	apply_delta (const TensorView &query, const TensorView &key, const TensorView &value, const TensorView &attention_output, const TensorView &attention_weights, const TensorView &attention_weights_dropped, const TensorView &output_delta, TensorView &attention_weight_delta, TensorView &query_delta, TensorView &key_delta, TensorView &value_delta) const
	Computes Q/K/V gradients from the output gradient and cached forward activations.

void	to_JSON (JsonWriter &w) const override
	Serializes the operator configuration to a JSON writer.

void	from_JSON (const Json *parent) override
	Restores the operator configuration from a JSON node.

void	destroy_cuda () override
	Releases CUDA resources owned by the operator; called from destructors.

	AttentionOp ()

	~AttentionOp () override

	AttentionOp (AttentionOp &&) noexcept

AttentionOp &	operator= (AttentionOp &&) noexcept

	AttentionOp (const AttentionOp &)=delete

AttentionOp &	operator= (const AttentionOp &)=delete

Public Member Functions inherited from opennn::Operator
virtual	~Operator ()=default

virtual vector< TensorSpec >	parameter_specs () const
	Returns the tensor specs of trainable parameters owned by this operator.

virtual vector< TensorSpec >	state_specs () const
	Returns the tensor specs of persistent state owned by this operator.

virtual void	link_parameters (span< const TensorView >)
	Binds parameter views provided by the hosting layer.

virtual void	link_gradients (span< const TensorView >)
	Binds gradient views provided by the hosting layer.

virtual void	link_states (span< const TensorView >)
	Binds state views provided by the hosting layer.

virtual void	set_parameters_random ()
	Initializes parameters with random values.

virtual void	set_parameters_glorot ()
	Initializes parameters using Glorot (Xavier) initialization.

virtual void	load_state_from_JSON (const Json *)
	Restores persistent state (e.g. running statistics) from a JSON node.

TensorView &	get_input (ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept

vector< TensorView > &	get_inputs (ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept

TensorView &	get_output (ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept

TensorView &	get_output_delta (BackPropagation &bp, size_t layer, size_t i=0) const noexcept

TensorView &	get_input_delta (BackPropagation &bp, size_t layer, size_t i=0) const noexcept

Public Attributes
Index	heads_number = 0

Index	head_dimension = 0

Index	query_sequence_length = 0

Index	source_sequence_length = 0

bool	use_causal_mask = false

Type	compute_dtype = Type::FP32

MatrixR	causal_mask

DropoutOp	dropout

size_t	source_view_index = 1

vector< size_t >	scratch_slots

vector< size_t >	attention_output_slots

Public Attributes inherited from opennn::Operator
vector< size_t >	input_slots = {0}

vector< size_t >	output_slots = {1}

vector< size_t >	input_delta_slots = {1}

vector< size_t >	output_delta_slots = {0}

Detailed Description

Scaled dot-product attention with optional causal mask and dropout.

Constructor & Destructor Documentation

◆ AttentionOp() [1/3]

opennn::AttentionOp::AttentionOp ( )

◆ ~AttentionOp()

opennn::AttentionOp::~AttentionOp ( )

override

◆ AttentionOp() [2/3]

opennn::AttentionOp::AttentionOp ( AttentionOp && )

noexcept

◆ AttentionOp() [3/3]

opennn::AttentionOp::AttentionOp ( const AttentionOp & )

delete

Member Function Documentation

◆ apply()

void opennn::AttentionOp::apply	(	const TensorView &	query,
		const TensorView &	key,
		const TensorView &	value,
		const TensorView &	source_input,
		TensorView &	attention_weights,
		TensorView &	attention_weights_dropped,
		TensorView &	output,
		float *	mask_scratch,
		bool	is_training )

Computes attention output from Q, K, V; applies softmax, mask and dropout in place.

Parameters

query	Query tensor (batch, heads, Q_seq, head_dim).
key	Key tensor (batch, heads, S_seq, head_dim).
value	Value tensor (batch, heads, S_seq, head_dim).
source_input	Source-side input used to build the padding mask.
attention_weights	Output attention scores (CPU only; empty on GPU).
attention_weights_dropped	Optional attention weights after the dropout mask is applied (CPU only).
output	Output tensor (batch, heads, Q_seq, head_dim).
mask_scratch	Scratch buffer for the temporary mask.
is_training	Enables dropout when true.

◆ apply_delta()

void opennn::AttentionOp::apply_delta	(	const TensorView &	query,
		const TensorView &	key,
		const TensorView &	value,
		const TensorView &	attention_output,
		const TensorView &	attention_weights,
		const TensorView &	attention_weights_dropped,
		const TensorView &	output_delta,
		TensorView &	attention_weight_delta,
		TensorView &	query_delta,
		TensorView &	key_delta,
		TensorView &	value_delta ) const

Computes Q/K/V gradients from the output gradient and cached forward activations.

Parameters

query	Forward Q tensor.
key	Forward K tensor.
value	Forward V tensor.
attention_output	Forward output tensor (read by GPU SDPA only).
attention_weights	Forward attention scores (CPU only).
attention_weights_dropped	Forward attention weights after the dropout mask is applied (CPU only).
output_delta	Gradient w.r.t. the attention output.
attention_weight_delta	Scratch tensor (CPU only).
query_delta	Output: gradient w.r.t. Q.
key_delta	Output: gradient w.r.t. K.
value_delta	Output: gradient w.r.t. V.

◆ back_propagate()

void opennn::AttentionOp::back_propagate	(	ForwardPropagation &	fp,
		BackPropagation &	bp,
		size_t	layer ) const

override virtual noexcept

Runs the operator's backward computation, accumulating into gradient/delta buffers.

Parameters

fp	Forward propagation workspace (read-only).
bp	Back propagation workspace receiving gradients and deltas.
layer	Index of the hosting layer in the workspace.

Reimplemented from opennn::Operator.

◆ destroy_cuda()

void opennn::AttentionOp::destroy_cuda ( )

override virtual

Releases CUDA resources owned by the operator; called from destructors.

Reimplemented from opennn::Operator.

◆ forward_propagate()

void opennn::AttentionOp::forward_propagate	(	ForwardPropagation &	fp,
		size_t	layer,
		bool	is_training )

override virtual noexcept

Runs the operator's forward computation.

Parameters

fp	Forward propagation workspace.
layer	Index of the hosting layer in the workspace.
is_training	If true, enables training-only behavior (e.g. dropout sampling).

Reimplemented from opennn::Operator.

◆ forward_scratch_specs()

vector< TensorSpec > opennn::AttentionOp::forward_scratch_specs ( Index batch_size ) const

Returns the tensor specs of the forward-pass scratch buffers used by the operator.

◆ from_JSON()

void opennn::AttentionOp::from_JSON ( const Json * )

override virtual

Restores the operator configuration from a JSON node.

Reimplemented from opennn::Operator.

◆ operator=() [1/2]

AttentionOp & opennn::AttentionOp::operator= ( AttentionOp && )

noexcept

◆ operator=() [2/2]

AttentionOp & opennn::AttentionOp::operator= ( const AttentionOp & )

delete

◆ set()

void opennn::AttentionOp::set	(	Index	heads_number,
		Index	head_dimension,
		Index	query_sequence_length,
		Index	source_sequence_length,
		bool	use_causal_mask,
		Type	compute_dtype )

Configures the attention geometry and compute precision.

Parameters

heads_number	Number of attention heads.
head_dimension	Per-head feature size.
query_sequence_length	Length of the query sequence.
source_sequence_length	Length of the key/value sequence.
use_causal_mask	If true, applies an upper-triangular causal mask.
compute_dtype	Dtype used for the QK and softmax-V matmuls.

◆ set_dropout_rate()

void opennn::AttentionOp::set_dropout_rate ( float rate )

inline

Sets the post-softmax dropout rate (0 disables dropout).

◆ to_JSON()

void opennn::AttentionOp::to_JSON ( JsonWriter & ) const

override virtual

Serializes the operator configuration to a JSON writer.

Reimplemented from opennn::Operator.

Member Data Documentation

◆ attention_output_slots

vector<size_t> opennn::AttentionOp::attention_output_slots

◆ causal_mask

MatrixR opennn::AttentionOp::causal_mask

◆ compute_dtype

Type opennn::AttentionOp::compute_dtype = Type::FP32

◆ dropout

DropoutOp opennn::AttentionOp::dropout

◆ head_dimension

Index opennn::AttentionOp::head_dimension = 0

◆ heads_number

Index opennn::AttentionOp::heads_number = 0

◆ query_sequence_length

Index opennn::AttentionOp::query_sequence_length = 0

◆ scratch_slots

vector<size_t> opennn::AttentionOp::scratch_slots

◆ source_sequence_length

Index opennn::AttentionOp::source_sequence_length = 0

◆ source_view_index

size_t opennn::AttentionOp::source_view_index = 1

◆ use_causal_mask

bool opennn::AttentionOp::use_causal_mask = false

Public Member Functions

Public Attributes

Detailed Description

Constructor & Destructor Documentation

◆ AttentionOp() [1/3]

◆ ~AttentionOp()

◆ AttentionOp() [2/3]

◆ AttentionOp() [3/3]

Member Function Documentation

◆ apply()

◆ apply_delta()

◆ back_propagate()

◆ destroy_cuda()

◆ forward_propagate()

◆ forward_scratch_specs()

◆ from_JSON()

◆ operator=() [1/2]

◆ operator=() [2/2]

◆ set()

◆ set_dropout_rate()

◆ to_JSON()

Member Data Documentation

◆ attention_output_slots

◆ causal_mask

◆ compute_dtype

◆ dropout

◆ head_dimension

◆ heads_number

◆ query_sequence_length

◆ scratch_slots

◆ source_sequence_length

◆ source_view_index

◆ use_causal_mask