OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
opennn::MultiHeadAttention Class Reference (final)

Multi-head scaled dot-product attention layer used in transformer architectures. More...

#include <multihead_attention_layer.h>

Inheritance diagram for opennn::MultiHeadAttention:
[legend]

Public Member Functions

 MultiHeadAttention (const Shape &=Shape({0, 0}), Index=0, const string &={})
 Constructs a self-attention layer where queries and keys share the same sequence.
 
 MultiHeadAttention (const Shape &new_query_dimensions, const Shape &new_source_dimensions, Index=0, const string &={})
 Constructs a cross-attention layer with separate query and source (key/value) sequences.
 
Shape get_input_shape () const override
 Returns the input tensor shape.
 
Shape get_output_shape () const override
 Returns the output tensor shape.
 
Index get_query_sequence_length () const
 
Index get_source_sequence_length () const
 
Index get_embedding_dimension () const
 
Index get_heads_number () const
 
Index get_head_dimension () const
 Returns the per-head dimension (embedding_dimension / heads_number).
 
Shape get_heads_shape (Index batch_size) const
 Returns the per-head tensor shape used internally during attention.
 
Shape get_concat_shape (Index batch_size) const
 Returns the shape used when concatenating heads back to the embedding dimension.
 
vector< TensorSpec > get_forward_specs (Index batch_size) const override
 Returns the tensor specifications used during forward propagation.
 
vector< TensorSpec > get_backward_specs (Index batch_size) const override
 Returns the tensor specifications used during back propagation.
 
void set (Index=0, Index=0, Index=0, Index=0, bool=false, const string &="multihead_attention_layer")
 Reconfigures the layer with new sequence, embedding, head sizes and causal flag.
 
void set_input_shape (const Shape &) override
 Updates the layer for a new input shape.
 
void on_compute_dtype_changed () override
 Rebuilds projection operators when the compute dtype changes.
 
void set_dropout_rate (float new_dropout_rate)
 Sets the dropout rate applied to the attention weights.
 
void read_JSON_body (const Json *) override
 Reads the layer configuration from a JSON node.
 
void write_JSON_body (JsonWriter &) const override
 Writes the layer configuration to a JSON writer.
 
- Public Member Functions inherited from opennn::Layer
virtual ~Layer ()=default
 
const string & get_label () const
 
const string & get_name () const
 
LayerType get_type () const
 
virtual void set_output_shape (const Shape &)
 Sets the output shape; subclasses override when the output is user-configurable.
 
void set_label (string new_label)
 
Index get_parameters_number () const
 Returns the total number of trainable parameters owned by this layer.
 
const vector< Operator * > & get_operators () const
 
virtual vector< TensorSpec > get_parameter_specs () const
 Returns the tensor specs of trainable parameters; subclasses override.
 
virtual vector< TensorSpec > get_state_specs () const
 Returns the tensor specs of persistent state (e.g. running mean/variance).
 
virtual ActivationOp::Function get_output_activation () const
 Returns the layer's output activation (Identity for most layers; overridden by Dense/Bounding).
 
Index get_inputs_number () const
 
Index get_outputs_number () const
 
virtual void forward_propagate (ForwardPropagation &fp, size_t layer, bool is_training) noexcept
 Runs the forward pass by chaining the layer's operators in order.
 
virtual void back_propagate (ForwardPropagation &fp, BackPropagation &bp, size_t i) const noexcept
 Runs the backward pass by chaining the layer's operators in reverse order.
 
virtual void from_JSON (const JsonDocument &document)
 Restores layer configuration and parameters from a JSON document.
 
virtual void load_state_from_JSON (const JsonDocument &document)
 Restores persistent state (e.g. running statistics) from a JSON document.
 
virtual void to_JSON (JsonWriter &writer) const
 Serializes layer configuration and parameters to a JSON writer.
 
virtual string write_expression (const vector< string > &, const vector< string > &) const
 Returns a human-readable mathematical expression for this layer (empty by default).
 
virtual void print () const
 Prints a human-readable summary of the layer to standard output.
 
bool get_is_trainable () const
 
Type get_compute_dtype () const
 
void set_compute_dtype (Type new_compute_dtype)
 Sets the compute dtype and notifies subclasses via on_compute_dtype_changed().
 
virtual float * link_states (float *pointer)
 Binds the persistent-state region of the shared buffer to operator views.
 
float * link_gradients (float *pointer, vector< TensorView > &gradient_views)
 Binds the gradient slice of the shared buffer to operator gradient views.
 
vector< TensorView > & get_parameter_views ()
 
const vector< TensorView > & get_parameter_views () const
 
void redistribute_parameters_to_operators ()
 Re-binds operator parameter views after the parameter buffer has been resized or moved.
 

Additional Inherited Members

- Protected Types inherited from opennn::Layer
enum  Forward { Input , Output }
 
enum  Backward { OutputDelta , InputDelta }
 
- Protected Member Functions inherited from opennn::Layer
 Layer ()=default
 
 Layer (LayerType t, bool trainable=true)
 
float * link_views_to_operators (vector< TensorView > &views, float *pointer, vector< TensorSpec >(Operator::*specs_fn)() const, void(Operator::*link_fn)(span< const TensorView >))
 
- Protected Attributes inherited from opennn::Layer
string label = "my_layer"
 
LayerType layer_type = LayerType::Dense
 
bool is_trainable = true
 
Shape input_shape
 
Type compute_dtype = Type::FP32
 
vector< TensorView > parameters
 
vector< TensorView > states
 
vector< Operator * > operators
 

Detailed Description

Multi-head scaled dot-product attention layer used in transformer architectures.

Constructor & Destructor Documentation

◆ MultiHeadAttention() [1/2]

opennn::MultiHeadAttention::MultiHeadAttention ( const Shape & = Shape({0, 0}),
Index = 0,
const string & = {} )

Constructs a self-attention layer where queries and keys share the same sequence.

Parameters
query_dimensions: Query input shape as (sequence_length, embedding_dimension).
heads_number: Number of attention heads.
name: Layer name used for serialization.

◆ MultiHeadAttention() [2/2]

opennn::MultiHeadAttention::MultiHeadAttention ( const Shape & new_query_dimensions,
const Shape & new_source_dimensions,
Index = 0,
const string & = {} )

Constructs a cross-attention layer with separate query and source (key/value) sequences.

Parameters
new_query_dimensions: Query shape as (query_sequence_length, embedding_dimension).
new_source_dimensions: Source shape as (source_sequence_length, embedding_dimension).
heads_number: Number of attention heads.
name: Layer name used for serialization.

Member Function Documentation

◆ get_backward_specs()

vector< TensorSpec > opennn::MultiHeadAttention::get_backward_specs ( Index batch_size) const
override virtual

Returns the tensor specifications used during back propagation.

Reimplemented from opennn::Layer.

◆ get_concat_shape()

Shape opennn::MultiHeadAttention::get_concat_shape ( Index batch_size) const
inline

Returns the shape used when concatenating heads back to the embedding dimension.

◆ get_embedding_dimension()

Index opennn::MultiHeadAttention::get_embedding_dimension ( ) const
inline

◆ get_forward_specs()

vector< TensorSpec > opennn::MultiHeadAttention::get_forward_specs ( Index batch_size) const
override virtual

Returns the tensor specifications used during forward propagation.

Reimplemented from opennn::Layer.

◆ get_head_dimension()

Index opennn::MultiHeadAttention::get_head_dimension ( ) const
inline

Returns the per-head dimension (embedding_dimension / heads_number).

◆ get_heads_number()

Index opennn::MultiHeadAttention::get_heads_number ( ) const
inline

◆ get_heads_shape()

Shape opennn::MultiHeadAttention::get_heads_shape ( Index batch_size) const
inline

Returns the per-head tensor shape used internally during attention.

◆ get_input_shape()

Shape opennn::MultiHeadAttention::get_input_shape ( ) const
override virtual

Returns the input tensor shape.

Reimplemented from opennn::Layer.

◆ get_output_shape()

Shape opennn::MultiHeadAttention::get_output_shape ( ) const
override virtual

Returns the output tensor shape.

Implements opennn::Layer.

◆ get_query_sequence_length()

Index opennn::MultiHeadAttention::get_query_sequence_length ( ) const
inline

◆ get_source_sequence_length()

Index opennn::MultiHeadAttention::get_source_sequence_length ( ) const
inline

◆ on_compute_dtype_changed()

void opennn::MultiHeadAttention::on_compute_dtype_changed ( )
override virtual

Rebuilds projection operators when the compute dtype changes.

Reimplemented from opennn::Layer.

◆ read_JSON_body()

void opennn::MultiHeadAttention::read_JSON_body ( const Json * )
override virtual

Reads the layer configuration from a JSON node.

Reimplemented from opennn::Layer.

◆ set()

void opennn::MultiHeadAttention::set ( Index = 0,
Index = 0,
Index = 0,
Index = 0,
bool = false,
const string & = "multihead_attention_layer" )

Reconfigures the layer with new sequence, embedding, head sizes and causal flag.

◆ set_dropout_rate()

void opennn::MultiHeadAttention::set_dropout_rate ( float new_dropout_rate)
inline

Sets the dropout rate applied to the attention weights.

◆ set_input_shape()

void opennn::MultiHeadAttention::set_input_shape ( const Shape & )
override virtual

Updates the layer for a new input shape.

Reimplemented from opennn::Layer.

◆ write_JSON_body()

void opennn::MultiHeadAttention::write_JSON_body ( JsonWriter & ) const
override virtual

Writes the layer configuration to a JSON writer.

Reimplemented from opennn::Layer.