Multi-head scaled dot-product attention layer used in transformer architectures.
More...
|
| | MultiHeadAttention (const Shape &=Shape({0, 0}), Index=0, const string &={}) |
| | Constructs a self-attention layer where queries, keys, and values all come from the same sequence.
|
| |
| | MultiHeadAttention (const Shape &new_query_dimensions, const Shape &new_source_dimensions, Index=0, const string &={}) |
| | Constructs a cross-attention layer with separate query and source (key/value) sequences.
|
| |
| Shape | get_input_shape () const override |
| | Returns the input tensor shape.
|
| |
| Shape | get_output_shape () const override |
| | Returns the output tensor shape.
|
| |
| Index | get_query_sequence_length () const |
| |
| Index | get_source_sequence_length () const |
| |
| Index | get_embedding_dimension () const |
| |
| Index | get_heads_number () const |
| |
| Index | get_head_dimension () const |
| | Returns the per-head dimension (embedding_dimension / heads_number).
|
| |
| Shape | get_heads_shape (Index batch_size) const |
| | Returns the per-head tensor shape used internally during attention.
|
| |
| Shape | get_concat_shape (Index batch_size) const |
| | Returns the shape used when concatenating heads back to the embedding dimension.
|
| |
| vector< TensorSpec > | get_forward_specs (Index batch_size) const override |
| | Returns the tensor specifications used during forward propagation.
|
| |
| vector< TensorSpec > | get_backward_specs (Index batch_size) const override |
| | Returns the tensor specifications used during back propagation.
|
| |
| void | set (Index=0, Index=0, Index=0, Index=0, bool=false, const string &="multihead_attention_layer") |
| | Reconfigures the layer with new sequence lengths, embedding dimension, number of heads, and causal flag.
|
| |
| void | set_input_shape (const Shape &) override |
| | Updates the layer for a new input shape.
|
| |
| void | on_compute_dtype_changed () override |
| | Rebuilds projection operators when the compute dtype changes.
|
| |
| void | set_dropout_rate (float new_dropout_rate) |
| | Sets the dropout rate applied to the attention weights.
|
| |
| void | read_JSON_body (const Json *) override |
| | Reads the layer configuration from a JSON node.
|
| |
| void | write_JSON_body (JsonWriter &) const override |
| | Writes the layer configuration to a JSON writer.
|
| |
| virtual | ~Layer ()=default |
| |
| const string & | get_label () const |
| |
| const string & | get_name () const |
| |
| LayerType | get_type () const |
| |
| virtual void | set_output_shape (const Shape &) |
| | Sets the output shape; subclasses override when the output is user-configurable.
|
| |
| void | set_label (string new_label) |
| |
| Index | get_parameters_number () const |
| | Returns the total number of trainable parameters owned by this layer.
|
| |
| const vector< Operator * > & | get_operators () const |
| |
| virtual vector< TensorSpec > | get_parameter_specs () const |
| | Returns the tensor specs of trainable parameters; subclasses override.
|
| |
| virtual vector< TensorSpec > | get_state_specs () const |
| | Returns the tensor specs of persistent state (e.g. running mean/variance).
|
| |
| virtual ActivationOp::Function | get_output_activation () const |
| | Returns the layer's output activation (Identity for most layers; overridden by Dense/Bounding).
|
| |
| Index | get_inputs_number () const |
| |
| Index | get_outputs_number () const |
| |
| virtual void | forward_propagate (ForwardPropagation &fp, size_t layer, bool is_training) noexcept |
| | Runs the forward pass by chaining the layer's operators in order.
|
| |
| virtual void | back_propagate (ForwardPropagation &fp, BackPropagation &bp, size_t i) const noexcept |
| | Runs the backward pass by chaining the layer's operators in reverse order.
|
| |
| virtual void | from_JSON (const JsonDocument &document) |
| | Restores layer configuration and parameters from a JSON document.
|
| |
| virtual void | load_state_from_JSON (const JsonDocument &document) |
| | Restores persistent state (e.g. running statistics) from a JSON document.
|
| |
| virtual void | to_JSON (JsonWriter &writer) const |
| | Serializes layer configuration and parameters to a JSON writer.
|
| |
| virtual string | write_expression (const vector< string > &, const vector< string > &) const |
| | Returns a human-readable mathematical expression for this layer (empty by default).
|
| |
| virtual void | print () const |
| | Prints a human-readable summary of the layer to standard output.
|
| |
| bool | get_is_trainable () const |
| |
| Type | get_compute_dtype () const |
| |
| void | set_compute_dtype (Type new_compute_dtype) |
| | Sets the compute dtype and notifies subclasses via on_compute_dtype_changed().
|
| |
| virtual float * | link_states (float *pointer) |
| | Binds the persistent-state region of the shared buffer to operator views.
|
| |
| float * | link_gradients (float *pointer, vector< TensorView > &gradient_views) |
| | Binds the gradient slice of the shared buffer to operator gradient views.
|
| |
| vector< TensorView > & | get_parameter_views () |
| |
| const vector< TensorView > & | get_parameter_views () const |
| |
| void | redistribute_parameters_to_operators () |
| | Re-binds operator parameter views after the parameter buffer has been resized or moved.
|
| |
Multi-head scaled dot-product attention layer used in transformer architectures.