OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
operators.h
Go to the documentation of this file.
1// OpenNN: Open Neural Networks Library
2// www.opennn.net
3//
4// O P E R A T O R S H E A D E R
5//
6// Artificial Intelligence Techniques SL
7// artelnics@artelnics.com
8
9#pragma once
10
11#include "tensor_utilities.h"
12#include "enum_map.h"
13#include "forward_propagation.h"
14#include "back_propagation.h"
15
16namespace opennn
17{
18
19class Json;
20class JsonWriter;
21
22#ifdef OPENNN_HAS_CUDA
23struct LtMatmulPlan;
24#endif
25
28{
/// Virtual destructor so derived operators can be destroyed through an Operator pointer.
29 virtual ~Operator() = default;
30
/// Tensor specs of the operator's parameters; this base version returns an empty list.
32 virtual vector<TensorSpec> parameter_specs() const { return {}; }
33
/// Tensor specs of persistent state owned by the operator; this base version returns an empty list.
35 virtual vector<TensorSpec> state_specs() const { return {}; }
36
/// Binds parameter views provided by the hosting layer. No-op in the base class.
38 virtual void link_parameters(span<const TensorView>) {}
39
/// Gradient counterpart of link_parameters (presumably binds gradient views — confirm
/// against the overrides). No-op in the base class.
41 virtual void link_gradients (span<const TensorView>) {}
42
/// Binds state views provided by the hosting layer. No-op in the base class.
44 virtual void link_states (span<const TensorView>) {}
45
/// Parameter initialization hook; the base class does nothing.
47 virtual void set_parameters_random() {}
48
/// Parameter initialization hook (Glorot scheme, judging by the name — confirm in the
/// overrides); the base class does nothing.
50 virtual void set_parameters_glorot() {}
51
/// Runs the operator's forward computation. The overrides in this file name the
/// arguments (fp, layer, is_training). No-op in the base class.
56 virtual void forward_propagate(ForwardPropagation&, size_t, bool) noexcept {}
57
/// Runs the operator's backward computation. The overrides in this file name the
/// arguments (fp, bp, layer). No-op in the base class.
62 virtual void back_propagate(ForwardPropagation&, BackPropagation&, size_t) const noexcept {}
63
/// Serializes the operator configuration to a JSON writer. No-op in the base class.
65 virtual void to_JSON (JsonWriter&) const {}
66
/// Restores the operator configuration from a JSON node. No-op in the base class.
68 virtual void from_JSON(const Json*) {}
69
/// Restores persistent state from a JSON node. No-op in the base class.
71 virtual void load_state_from_JSON(const Json*) {}
72
/// Releases CUDA resources owned by the operator; derived destructors in this file
/// call it. No-op in the base class.
74 virtual void destroy_cuda() {}
75
// Slot indices into fp.views[layer], consumed by get_input()/get_inputs() and
// get_output() respectively. Defaults: read slot 0, write slot 1.
76 vector<size_t> input_slots = {0};
77 vector<size_t> output_slots = {1};
78
// Slot indices into bp.delta_views[layer], consumed by get_input_delta() and
// get_output_delta() respectively. Defaults: input delta in slot 1, output delta in slot 0.
79 vector<size_t> input_delta_slots = {1};
80 vector<size_t> output_delta_slots = {0};
81
/// Returns the first view (index 0) stored in forward slot input_slots[i] of `layer`.
/// `i` selects which entry of input_slots to use (default: the first one) and is not
/// range-checked here.
82 TensorView& get_input(ForwardPropagation& fp, size_t layer, size_t i = 0) const noexcept
83 {
84 return fp.views[layer][input_slots[i]][0];
85 }
86
/// Returns the whole list of views stored in forward slot input_slots[i] of `layer`
/// (get_input() returns only element 0 of this list). `i` is not range-checked here.
87 vector<TensorView>& get_inputs(ForwardPropagation& fp, size_t layer, size_t i = 0) const noexcept
88 {
89 return fp.views[layer][input_slots[i]];
90 }
91
/// Returns the first view (index 0) stored in forward slot output_slots[i] of `layer`.
/// `i` selects which entry of output_slots to use and is not range-checked here.
92 TensorView& get_output(ForwardPropagation& fp, size_t layer, size_t i = 0) const noexcept
93 {
94 return fp.views[layer][output_slots[i]][0];
95 }
96
/// Returns the delta view bp.delta_views[layer][output_delta_slots[i]].
/// Unlike the forward accessors there is a single view per delta slot, so no trailing [0].
/// `i` is not range-checked here.
97 TensorView& get_output_delta(BackPropagation& bp, size_t layer, size_t i = 0) const noexcept
98 {
99 return bp.delta_views[layer][output_delta_slots[i]];
100 }
101
/// Returns the delta view bp.delta_views[layer][input_delta_slots[i]].
/// `i` selects which entry of input_delta_slots to use and is not range-checked here.
102 TensorView& get_input_delta(BackPropagation& bp, size_t layer, size_t i = 0) const noexcept
103 {
104 return bp.delta_views[layer][input_delta_slots[i]];
105 }
106};
107
110{
112 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
114 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
115
116private:
117 void check(const vector<TensorView>& inputs, const TensorView& output) const;
118};
119
122{
123 float rate = 0.0f;
124
126
127 vector<size_t> save_slots;
128
/// True when `rate` is strictly positive; with the default rate of 0.0f this returns false.
130 bool active() const { return rate > 0.0f; }
131
133 void set_rate(float new_rate);
134
136 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
138 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
139
141 void apply_cpu(TensorView& output);
142
144 void apply_gpu(TensorView& output);
145
147 void apply_delta(TensorView& delta) const;
148
149 void to_JSON(JsonWriter& w) const override;
150 void from_JSON(const Json* parent) override;
151
152 void destroy_cuda() override;
153
/// Releases CUDA resources on destruction via destroy_cuda().
154 ~DropoutOp() override { destroy_cuda(); }
155
/// Default-constructible and movable (member-wise, compiler-generated moves).
// NOTE(review): the moves are defaulted while the destructor calls destroy_cuda().
// If this struct owns raw CUDA handles (its data members are not all visible here),
// confirm that a moved-from object cannot release the same handle a second time.
156 DropoutOp() = default;
157 DropoutOp(DropoutOp&&) noexcept = default;
158 DropoutOp& operator=(DropoutOp&&) noexcept = default;
159
160private:
161 void apply_delta_cpu(TensorView& delta) const;
162 void apply_delta_gpu(TensorView& delta) const;
163
164 void ensure_mask(Index n);
165};
166
169{
172
174 static const EnumMap<Function>& map();
175
177 static Function from_string(const string& name);
178
180 static const string& to_string(Function function);
181
184
186
188
189 // Backward override: when non-empty, back_propagate reads the activation's
190 // output from this slot instead of output_slots[0]. Used when a downstream
191 // operator (e.g. DropoutOp) overwrites the activation's output in place.
192 vector<size_t> output_slots_backward;
193
195 void set_function(Function new_function);
196
198 void set_function(const string& name);
199
201 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
203 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
204
206 void apply_cpu(TensorView& output);
207
209 void apply_gpu(TensorView& output);
210
212 void apply_delta(const TensorView& outputs, TensorView& delta) const;
213
214 void to_JSON(JsonWriter& w) const override;
215 void from_JSON(const Json* parent) override;
216
217 void destroy_cuda() override;
218
219 ~ActivationOp() override { destroy_cuda(); }
220
221 ActivationOp() = default;
222 ActivationOp(const ActivationOp&) = delete;
224
225private:
226 void apply_delta_cpu(const TensorView& outputs, TensorView& delta) const;
227 void apply_delta_gpu(const TensorView& outputs, TensorView& delta) const;
228};
229
232{
233 Index input_features = 0;
236
239
242
247 void set(Index new_input_features, Index new_output_features, Type new_weight_type = Type::FP32);
248
249 vector<TensorSpec> parameter_specs() const override;
250 void link_parameters(span<const TensorView> views) override;
251 void link_gradients (span<const TensorView> views) override;
252
253 void set_parameters_random() override;
254 void set_parameters_glorot() override;
255
257 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
259 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
260
262 void apply(const TensorView& input, TensorView& output, cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS);
263
269 void apply_delta(const TensorView& output_delta,
270 const TensorView& input,
271 TensorView& input_delta,
272 bool accumulate_input_delta = false) const;
273
274private:
275 void apply_cpu(const TensorView& input, TensorView& output, cublasLtEpilogue_t epilogue);
276 void apply_gpu(const TensorView& input, TensorView& output, cublasLtEpilogue_t epilogue);
277
278 void apply_delta_cpu(const TensorView& output_delta, const TensorView& input,
279 TensorView& input_delta, bool accumulate_input_delta) const;
280 void apply_delta_gpu(const TensorView& output_delta, const TensorView& input,
281 TensorView& input_delta, bool accumulate_input_delta) const;
282};
283
286{
289
291 void set(Index input_features, Index output_features, Type weight_type = Type::FP32);
292
// Parameter handling is forwarded unchanged to the embedded `combination` operator:
// specs, parameter binding and gradient binding all delegate to it.
293 vector<TensorSpec> parameter_specs() const override { return combination.parameter_specs(); }
294 void link_parameters(span<const TensorView> views) override { combination.link_parameters(views); }
295 void link_gradients (span<const TensorView> views) override { combination.link_gradients(views); }
296
// Parameter initialization is likewise delegated to `combination`.
297 void set_parameters_random() override { combination.set_parameters_random(); }
298 void set_parameters_glorot() override { combination.set_parameters_glorot(); }
299
301 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
303 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
304};
305
308{
309 Index features = 0;
310 float momentum = 0.1f;
311
316
319
/// True when `features` is positive; with the in-class default of 0 this returns false.
321 bool active() const { return features > 0; }
322
326 void set(Index new_features, float new_momentum = 0.1f);
327
328 vector<TensorSpec> parameter_specs() const override;
329 vector<TensorSpec> state_specs() const override;
330 void link_parameters(span<const TensorView> views) override;
331 void link_gradients (span<const TensorView> views) override;
332 void link_states (span<const TensorView> views) override;
333
336
339
340 // Slot convention (set by hosting layer):
341 // input_slots = {input}
342 // output_slots = {output, mean, inverse_variance}
344 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
346 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
347
353 void apply_delta(const TensorView& input,
354 const TensorView& mean,
355 const TensorView& inverse_variance,
356 TensorView& delta) const;
357
360
/// Marks the inference cache as stale by setting inference_cache_dirty, so it is rebuilt
/// on the next inference call.
362 void invalidate_inference_cache() { inference_cache_dirty = true; }
363
364 void to_JSON(JsonWriter& w) const override;
365 void from_JSON(const Json* parent) override;
366 void load_state_from_JSON(const Json* parent) override;
367
368private:
369 VectorR inference_scale;
370 VectorR inference_shift;
371 bool inference_cache_dirty = true;
372
373 mutable VectorR delta_scale_scratch;
374
375 void apply_inference_cpu(const TensorView& input, TensorView& output);
376 void apply_inference_gpu(const TensorView& input, TensorView& output);
377
378 void apply_training_cpu (const TensorView& input,
379 TensorView& mean, TensorView& inverse_variance,
380 TensorView& output);
381 void apply_training_gpu (const TensorView& input,
382 TensorView& mean, TensorView& inverse_variance,
383 TensorView& output);
384
385 void apply_delta_cpu(const TensorView& input,
386 const TensorView& mean,
387 const TensorView& inverse_variance,
388 TensorView& delta) const;
389 void apply_delta_gpu(const TensorView& input,
390 const TensorView& mean,
391 const TensorView& inverse_variance,
392 TensorView& delta) const;
393};
394
397{
398 Index input_height = 0;
399 Index input_width = 0;
400
401 Index kernels_number = 0;
402 Index kernel_height = 0;
403 Index kernel_width = 0;
405
406 Index padding_height = 0;
407 Index padding_width = 0;
408
410
413
416
417#ifdef OPENNN_HAS_CUDA
418 cudnnFilterDescriptor_t kernel_descriptor = nullptr;
419 cudnnConvolutionDescriptor_t convolution_descriptor = nullptr;
420
424
425 size_t cudnn_workspace_size_ = 0;
426
427 // High-water-mark of the batch size for which the cuDNN plan
428 // (algorithms + workspaces) is currently valid. Lazy-initialized on the
429 // first apply_gpu and re-tuned only if a larger batch arrives (e.g. test
430 // batch larger than training).
431 Index planned_batch_size = 0;
432#endif
433
446 void set(Index input_h, Index input_w,
447 Index kernels_n, Index kernel_h, Index kernel_w, Index kernel_c,
448 Index row_stride, Index column_stride,
449 Index padding_h, Index padding_w,
451
452 vector<TensorSpec> parameter_specs() const override;
453 void link_parameters(span<const TensorView> views) override;
454 void link_gradients (span<const TensorView> views) override;
455
456 void set_parameters_random() override;
457 void set_parameters_glorot() override;
458
459 void destroy_cuda() override;
460
461 ~ConvolutionOp() override { destroy_cuda(); }
462
463 ConvolutionOp() = default;
464 ConvolutionOp(const ConvolutionOp&) = delete;
466
468 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
470 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
471
473 void apply_cpu(const TensorView& input, TensorView& output);
474
476 void apply_gpu(const TensorView& input, TensorView& output, cudnnActivationDescriptor_t fused_activation = nullptr);
477
479 void apply_delta(const TensorView& input,
480 const TensorView& output_delta,
481 TensorView& input_delta) const;
482
483private:
484
485 void apply_delta_cpu(const TensorView& input, const TensorView& output_delta,
486 TensorView& input_delta) const;
487 void apply_delta_gpu(const TensorView& input, const TensorView& output_delta,
488 TensorView& input_delta) const;
489
490 void plan_convolution_algorithms(const TensorView& input, const TensorView& output);
491
492 array<pair<Index, Index>, 4> nhwc_padding() const;
493};
494
497{
500
502 void set(Index input_h, Index input_w,
503 Index kernels_n, Index kernel_h, Index kernel_w, Index kernel_c,
504 Index row_stride, Index column_stride,
505 Index padding_h, Index padding_w,
506 Type compute_dtype);
507
// All parameters belong to the embedded `convolution` operator; specs, parameter
// binding and gradient binding are forwarded to it unchanged.
508 vector<TensorSpec> parameter_specs() const override { return convolution.parameter_specs(); }
509 void link_parameters(span<const TensorView> views) override { convolution.link_parameters(views); }
510 void link_gradients (span<const TensorView> views) override { convolution.link_gradients(views); }
511
// Parameter initialization is delegated to `convolution` as well.
512 void set_parameters_random() override { convolution.set_parameters_random(); }
513 void set_parameters_glorot() override { convolution.set_parameters_glorot(); }
514
// Releases the CUDA resources of both embedded operators, convolution first.
515 void destroy_cuda() override { convolution.destroy_cuda(); activation.destroy_cuda(); }
517
518 ConvolutionReluOp() = default;
521
523 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
525 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
526};
527
530{
533
536
539
542
543 vector<TensorSpec> parameter_specs() const override;
544 void link_parameters(span<const TensorView> views) override;
545 void link_gradients (span<const TensorView> views) override;
546
549
552
554 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
556 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
557
558private:
559 void apply_cpu(const TensorView& input,
560 TensorView& means, TensorView& standard_deviations, TensorView& normalized,
561 TensorView& output);
562
563 void apply_gpu(const TensorView& input,
564 TensorView& means, TensorView& standard_deviations,
565 TensorView& output);
566
567 void apply_delta_cpu(const TensorView& output_delta,
568 const TensorView& standard_deviations,
569 const TensorView& normalized,
570 TensorView& input_delta) const;
571 void apply_delta_gpu(const TensorView& input,
572 const TensorView& output_delta,
573 const TensorView& means, const TensorView& standard_deviations,
574 TensorView& input_delta) const;
575};
576
579{
581 Index input_features = 0;
582 Index heads_number = 0;
583 Index head_dimension = 0;
585
586 // Which view inside views[input_slots[0]] to read. 0 for query path, 1 for
587 // source path; clamped to size()-1 so self-attention (single input view)
588 // works regardless.
590
591 // Slot holding the shared transpose-scratch buffer.
592 vector<size_t> scratch_slots;
593
594 // Backward configuration. Self vs cross-attention is detected per-call from
595 // forward_views[input_slots[0]].size() (1 = self, 2 = cross). The two pairs
596 // below select destination slot and accumulate flag for each mode.
601
608
// Parameter handling is forwarded unchanged to the embedded `combination` operator:
// specs, parameter binding and gradient binding all delegate to it.
609 vector<TensorSpec> parameter_specs() const override { return combination.parameter_specs(); }
610 void link_parameters(span<const TensorView> views) override { combination.link_parameters(views); }
611 void link_gradients (span<const TensorView> views) override { combination.link_gradients(views); }
612
// Parameter initialization is likewise delegated to `combination`.
613 void set_parameters_random() override { combination.set_parameters_random(); }
614 void set_parameters_glorot() override { combination.set_parameters_glorot(); }
615
617 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
619 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
620
625 void apply(const TensorView& input, TensorView& head_output, float* scratch);
626
633 void apply_delta(const TensorView& head_delta,
634 const TensorView& input,
635 TensorView& input_delta,
636 bool accumulate,
637 float* scratch) const;
638};
639
642{
643 Index heads_number = 0;
644 Index head_dimension = 0;
647 bool use_causal_mask = false;
649
651
653
661 void set(Index heads_number, Index head_dimension,
664
/// Sets the post-softmax dropout rate (0 disables dropout) by forwarding `rate` to the
/// embedded `dropout` member's set_rate().
666 void set_dropout_rate(float rate) { dropout.set_rate(rate); }
667
669 vector<TensorSpec> forward_scratch_specs(Index batch_size) const;
670
671 // Slot convention (set by hosting layer):
672 // input_slots = {Query, Key, Value, Input} (Input read via source_view_index)
673 // output_slots = {AttentionWeights, AttentionWeightsDropped}
674 // scratch_slots = {TransposeScratch} (used as attention_out + mask_scratch)
675 // attention_output_slots = {ConcatenatedAttentionOutputs} (backward-only: merged output for SDPA)
676 // output_delta_slots = {AttentionWeightDelta, InputQueryDelta, InputSourceDelta, ValueDelta}
677 size_t source_view_index = 1; // 1 = source path; clamped to size()-1 for self-attention
678
679 vector<size_t> scratch_slots;
681
683 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
685 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
686
697 void apply(const TensorView& query, // {B, H, Q_seq, D}
698 const TensorView& key, // {B, H, S_seq, D}
699 const TensorView& value, // {B, H, S_seq, D}
700 const TensorView& source_input, // {B, S_seq, embed} for padding mask
701 TensorView& attention_weights, // {B, H, Q_seq, S_seq} on CPU; empty on GPU
702 TensorView& attention_weights_dropped, // CPU-only, optional
703 TensorView& output, // {B, H, Q_seq, D}
704 float* mask_scratch,
705 bool is_training);
706
719 void apply_delta(const TensorView& query,
720 const TensorView& key,
721 const TensorView& value,
722 const TensorView& attention_output, // forward output O — only read by GPU SDPA
723 const TensorView& attention_weights,
724 const TensorView& attention_weights_dropped,
725 const TensorView& output_delta, // {B, H, Q_seq, D}
726 TensorView& attention_weight_delta, // CPU-only scratch; empty on GPU
727 TensorView& query_delta,
728 TensorView& key_delta,
729 TensorView& value_delta) const;
730
731 void to_JSON(JsonWriter& w) const override;
732 void from_JSON(const Json* parent) override;
733
734 void destroy_cuda() override;
735
737 ~AttentionOp() override;
739 AttentionOp& operator=(AttentionOp&&) noexcept;
740 AttentionOp(const AttentionOp&) = delete;
741 AttentionOp& operator=(const AttentionOp&) = delete;
742
743 struct SDPACache;
744
745private:
746 float scaling_factor() const;
747
748 void apply_cpu(const TensorView& query,
749 const TensorView& key,
750 const TensorView& value,
751 const TensorView& source_input,
752 TensorView& attention_weights,
753 TensorView& attention_weights_dropped,
754 TensorView& output,
755 float* mask_scratch,
756 bool is_training);
757
758 void apply_gpu(const TensorView& query,
759 const TensorView& key,
760 const TensorView& value,
761 const TensorView& source_input,
762 TensorView& attention_weights,
763 TensorView& attention_weights_dropped,
764 TensorView& output,
765 float* mask_scratch,
766 bool is_training);
767
768 // Same signature as apply_delta(); CPU path ignores attention_output.
769 void apply_delta_cpu(const TensorView& query,
770 const TensorView& key,
771 const TensorView& value,
772 const TensorView& attention_output,
773 const TensorView& attention_weights,
774 const TensorView& attention_weights_dropped,
775 const TensorView& output_delta,
776 TensorView& attention_weight_delta,
777 TensorView& query_delta,
778 TensorView& key_delta,
779 TensorView& value_delta) const;
780
781 void apply_delta_gpu(const TensorView& query,
782 const TensorView& key,
783 const TensorView& value,
784 const TensorView& attention_output,
785 const TensorView& attention_weights,
786 const TensorView& attention_weights_dropped,
787 const TensorView& output_delta,
788 TensorView& attention_weight_delta,
789 TensorView& query_delta,
790 TensorView& key_delta,
791 TensorView& value_delta) const;
792
793 void apply_delta_gpu_unfused(const TensorView& query,
794 const TensorView& key,
795 const TensorView& value,
796 const TensorView& attention_weights,
797 const TensorView& attention_weights_dropped,
798 const TensorView& output_delta,
799 TensorView& attention_weight_delta,
800 TensorView& query_delta,
801 TensorView& key_delta,
802 TensorView& value_delta) const;
803
804 static bool get_contiguous_source_lengths(const TensorView& source_input,
805 vector<Index>& lengths,
806 bool& has_padding);
807 static void softmax_rows_prefix(float* matrix, Index rows, Index cols, Index length);
808 static Index infer_attention_prefix_length(const TensorView& attention_weights,
809 Index batch_index);
810
811 // Common backbone for the unfused CPU and GPU paths. The softmax-backward
812 // step differs (Eigen vs cuDNN) and is supplied as a callable.
813 template<typename SoftmaxBwd>
814 void apply_delta_unfused(const TensorView& query,
815 const TensorView& key,
816 const TensorView& value,
817 const TensorView& attention_weights,
818 const TensorView& attention_weights_dropped,
819 const TensorView& output_delta,
820 TensorView& attention_weight_delta,
821 TensorView& query_delta,
822 TensorView& key_delta,
823 TensorView& value_delta,
824 SoftmaxBwd&& softmax_bwd) const;
825
826 mutable unique_ptr<SDPACache> sdpa_cache;
827
828 // SDPA dropout RNG state. Forward advances `sdpa_dropout_offset`; backward
829 // replays the previous step's offset via `sdpa_last_used_offset` so the
830 // dropout mask is reproduced. Seed is fixed per AttentionOp instance.
831 uint64_t sdpa_dropout_seed = 0x9E3779B97F4A7C15ULL;
832 uint64_t sdpa_dropout_offset = 0;
833 mutable uint64_t sdpa_last_used_offset = 0;
834};
835
837// Forward = merge_heads; the layer hosts the shape configuration via set().
839{
840 Index heads_number = 0;
842 Index head_dimension = 0;
844
847
849 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
850
851 // Note: writes the heads gradient back to the SAME forward slot it reads from in
852 // forward (input_slots[0]). Buffer reuse for memory efficiency — the next backward
853 // op (AttentionOp) consumes the heads gradient from that scratch slot.
855 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
856};
857
860{
862 enum Method { Max, Average };
863
864 Index input_height = 0;
865 Index input_width = 0;
866 Index input_channels = 0;
867
868 Index pool_height = 1;
869 Index pool_width = 1;
870 Index row_stride = 1;
871 Index column_stride = 1;
872 Index padding_height = 0;
873 Index padding_width = 0;
874
876
877#ifdef OPENNN_HAS_CUDA
878 cudnnPoolingDescriptor_t pooling_descriptor = nullptr;
879#endif
880
892 void set(Index input_h, Index input_w, Index input_c,
893 Index pool_h, Index pool_w,
894 Index row_stride, Index column_stride,
895 Index padding_h, Index padding_w,
896 Method method);
897
898 void destroy_cuda() override;
899
900 ~PoolOp() override { destroy_cuda(); }
901
902 PoolOp() = default;
903 PoolOp(const PoolOp&) = delete;
904 PoolOp& operator=(const PoolOp&) = delete;
905
906 // Slot convention (set by Pooling layer in update_pool_operator):
907 // input_slots = {Input}
908 // output_slots = {Output, MaximalIndices} for MaxPooling
909 // output_slots = {Output} for AveragePooling
911 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
913 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
914
915private:
916 void apply_cpu(const TensorView& input, TensorView& output, TensorView& maximal_indices, bool is_training);
917 void apply_gpu(const TensorView& input, TensorView& output);
918
919 void apply_delta_cpu(const TensorView& output_delta,
921 TensorView& input_delta) const;
922 void apply_delta_gpu(const TensorView& input,
923 const TensorView& output,
924 const TensorView& output_delta,
925 TensorView& input_delta) const;
926};
927
930{
932 enum Method { Max, Average };
934
935 // Slot convention (set by Pooling3d layer):
936 // input_slots = {Input}
937 // output_slots = {Output, MaximalIndices}
938 // For AveragePooling, MaximalIndices is allocated empty (Shape{}).
940 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
942 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
943};
944
947{
951
952 bool scale_embedding = false;
954
955 float embedding_scale = 1.0f;
956
959
961
966 void set(Index new_vocabulary_size, Index new_sequence_length, Index new_embedding_dimension);
967
968 vector<TensorSpec> parameter_specs() const override;
969 vector<TensorSpec> state_specs() const override;
970 void link_parameters(span<const TensorView> views) override;
971 void link_gradients (span<const TensorView> views) override;
972 void link_states (span<const TensorView> views) override;
973
974 void set_parameters_random() override;
975 void set_parameters_glorot() override;
976
979
981 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
983 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
984
985private:
986 void apply_cpu(const TensorView& indices, TensorView& output);
987 void apply_gpu(const TensorView& indices, TensorView& output);
988
989 void apply_delta_cpu(const TensorView& indices, const TensorView& output_delta) const;
990 void apply_delta_gpu(const TensorView& indices, const TensorView& output_delta) const;
991};
992
995{
997 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
999 void back_propagate(ForwardPropagation& fp, BackPropagation& bp, size_t layer) const noexcept override;
1000};
1001
1004{
1006 enum class Method { NoBounding, Bounding };
1007
1009
1012
1014 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
1015};
1016
1019{
1020 float min_range = -1.0f;
1021 float max_range = 1.0f;
1022
1028
1030 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
1031};
1032
1035{
1036 float min_range = -1.0f;
1037 float max_range = 1.0f;
1038
1044
1046 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
1047};
1048
1049}
1050
1051// OpenNN: Open Neural Networks Library.
1052// Copyright(C) 2005-2026 Artificial Intelligence Techniques, SL.
1053// Licensed under the GNU Lesser General Public License v2.1 or later.
Definition json.h:85
Definition json.h:23
Definition adaptive_moment_estimation.h:14
float mean(const VectorR &)
Arithmetic mean of a vector, ignoring NaNs.
Eigen::array< T, N > array
Definition tensor_utilities.h:471
VectorI maximal_indices(const VectorR &, Index)
Indices of the n largest elements of a vector.
Type
Numeric precision used for training or inference tensors.
Definition configuration.h:20
@ FP32
Definition configuration.h:20
Matrix< float, Dynamic, 1 > VectorR
Definition pch.h:181
cudnnConvolutionFwdAlgo_t
Definition pch.h:103
@ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
Definition pch.h:103
void * cudnnFilterDescriptor_t
Definition pch.h:110
cudnnActivationMode_t
Definition pch.h:100
cublasLtEpilogue_t
Definition pch.h:96
@ CUBLASLT_EPILOGUE_BIAS
Definition pch.h:96
void * cudnnPoolingDescriptor_t
Definition pch.h:112
cudnnConvolutionBwdDataAlgo_t
Definition pch.h:104
@ CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
Definition pch.h:104
Matrix< float, Dynamic, Dynamic, Layout > MatrixR
Definition pch.h:177
void * cudnnActivationDescriptor_t
Definition pch.h:113
void * cudnnConvolutionDescriptor_t
Definition pch.h:111
cudnnConvolutionBwdFilterAlgo_t
Definition pch.h:105
@ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
Definition pch.h:105
Element-wise non-linear activation (Identity, Sigmoid, Tanh, ReLU, Softmax).
Definition operators.h:169
void apply_delta(const TensorView &outputs, TensorView &delta) const
Multiplies delta by the derivative of the activation evaluated at outputs.
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void to_JSON(JsonWriter &w) const override
Serializes the operator configuration to a JSON writer.
static const string & to_string(Function function)
Returns the string name of a Function.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
vector< size_t > output_slots_backward
Definition operators.h:192
void apply_gpu(TensorView &output)
GPU forward implementation; applies the activation in place on output.
Function
Supported activation functions.
Definition operators.h:171
@ Sigmoid
Definition operators.h:171
@ Softmax
Definition operators.h:171
@ Identity
Definition operators.h:171
@ Tanh
Definition operators.h:171
@ ReLU
Definition operators.h:171
void set_function(Function new_function)
Selects the activation function and configures cuDNN descriptors.
cudnnActivationDescriptor_t descriptor
Definition operators.h:187
ActivationOp(const ActivationOp &)=delete
Function function
Definition operators.h:185
static const EnumMap< Function > & map()
Returns the bidirectional mapping between Function values and their string names.
void set_function(const string &name)
Selects the activation function by name (delegates to set_function(Function)).
ActivationOp & operator=(const ActivationOp &)=delete
static cudnnActivationMode_t to_cudnn_mode(Function function)
Returns the cuDNN activation mode corresponding to a Function.
static Function from_string(const string &name)
Returns the Function corresponding to a string name.
void apply_cpu(TensorView &output)
CPU forward implementation; applies the activation in place on output.
~ActivationOp() override
Definition operators.h:219
void from_JSON(const Json *parent) override
Restores the operator configuration from a JSON node.
Element-wise sum of several input tensors (used by residual connections).
Definition operators.h:110
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
~AttentionOp() override
vector< TensorSpec > forward_scratch_specs(Index batch_size) const
Returns the tensor specs of the forward-pass scratch buffers used by the operator.
Index source_sequence_length
Definition operators.h:646
Index query_sequence_length
Definition operators.h:645
bool use_causal_mask
Definition operators.h:647
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
void set(Index heads_number, Index head_dimension, Index query_sequence_length, Index source_sequence_length, bool use_causal_mask, Type compute_dtype)
Configures the attention geometry and compute precision.
MatrixR causal_mask
Definition operators.h:650
void set_dropout_rate(float rate)
Sets the post-softmax dropout rate (0 disables dropout).
Definition operators.h:666
DropoutOp dropout
Definition operators.h:652
void apply(const TensorView &query, const TensorView &key, const TensorView &value, const TensorView &source_input, TensorView &attention_weights, TensorView &attention_weights_dropped, TensorView &output, float *mask_scratch, bool is_training)
Computes attention output from Q, K, V; applies softmax, mask and dropout in place.
void apply_delta(const TensorView &query, const TensorView &key, const TensorView &value, const TensorView &attention_output, const TensorView &attention_weights, const TensorView &attention_weights_dropped, const TensorView &output_delta, TensorView &attention_weight_delta, TensorView &query_delta, TensorView &key_delta, TensorView &value_delta) const
Computes Q/K/V gradients from the output gradient and cached forward activations.
vector< size_t > attention_output_slots
Definition operators.h:680
void from_JSON(const Json *parent) override
Restores the operator configuration from a JSON node.
Index head_dimension
Definition operators.h:644
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Type compute_dtype
Definition operators.h:648
AttentionOp(AttentionOp &&) noexcept
void to_JSON(JsonWriter &w) const override
Serializes the operator configuration to a JSON writer.
Index heads_number
Definition operators.h:643
vector< size_t > scratch_slots
Definition operators.h:679
size_t source_view_index
Definition operators.h:677
Workspace holding parameter gradients and per-layer deltas during a backward pass.
Definition back_propagation.h:21
vector< vector< TensorView > > delta_views
Definition back_propagation.h:44
Batch normalization with learnable scale/shift and running statistics for inference.
Definition operators.h:308
void load_state_from_JSON(const Json *parent) override
Restores persistent state (e.g. running statistics) from a JSON node.
Index features
Definition operators.h:309
void set(Index new_features, float new_momentum=0.1f)
Configures the per-feature normalization.
void link_states(span< const TensorView > views) override
Binds state views provided by the hosting layer.
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
TensorView gamma
Definition operators.h:312
void init_defaults()
Resets gamma to one, beta to zero, and the running mean/variance to their identity values.
void apply_delta(const TensorView &input, const TensorView &mean, const TensorView &inverse_variance, TensorView &delta) const
Computes the input gradient given the cached normalization statistics from forward.
void invalidate_inference_cache()
Marks the inference cache as stale so it is rebuilt on the next inference call.
Definition operators.h:362
vector< TensorSpec > state_specs() const override
Returns the tensor specs of persistent state owned by this operator.
float momentum
Definition operators.h:310
void from_JSON(const Json *parent) override
Restores the operator configuration from a JSON node.
TensorView beta_gradient
Definition operators.h:318
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void update_inference_cache()
Rebuilds the fused scale/shift cache used by the inference path from running stats.
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:335
void set_parameters_random() override
Initializes parameters with random values.
Definition operators.h:334
TensorView beta
Definition operators.h:313
TensorView gamma_gradient
Definition operators.h:317
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
TensorView running_mean
Definition operators.h:314
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
bool active() const
Returns true when the operator has been configured (features > 0).
Definition operators.h:321
void to_JSON(JsonWriter &w) const override
Serializes the operator configuration to a JSON writer.
TensorView running_variance
Definition operators.h:315
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Clamps each output channel to a configurable lower/upper interval.
Definition operators.h:1004
TensorView upper
Definition operators.h:1011
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Method
Disables bounding or enables per-channel clamping.
Definition operators.h:1006
@ Bounding
Definition operators.h:1006
@ NoBounding
Definition operators.h:1006
Method method
Definition operators.h:1008
TensorView lower
Definition operators.h:1010
Owning raw byte buffer that lives on CPU or CUDA memory, with aligned (re)allocation.
Definition tensor_utilities.h:166
Affine combination output = input * weights + bias (the dense matmul building block).
Definition operators.h:232
TensorView bias_gradient
Definition operators.h:241
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
TensorView weights
Definition operators.h:237
void set_parameters_random() override
Initializes parameters with random values.
Index input_features
Definition operators.h:233
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Type weight_type
Definition operators.h:235
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void apply(const TensorView &input, TensorView &output, cublasLtEpilogue_t epilogue=CUBLASLT_EPILOGUE_BIAS)
Computes output = input * weights + bias with an optional fused epilogue (ReLU, bias, ...).
TensorView weight_gradient
Definition operators.h:240
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
void apply_delta(const TensorView &output_delta, const TensorView &input, TensorView &input_delta, bool accumulate_input_delta=false) const
Computes input_delta from output_delta and updates weight/bias gradients.
TensorView bias
Definition operators.h:238
Index output_features
Definition operators.h:234
void set(Index new_input_features, Index new_output_features, Type new_weight_type=Type::FP32)
Configures input/output dimensions and the weight storage dtype.
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
Fused affine + ReLU activation (uses cuBLASLt epilogue on GPU when available).
Definition operators.h:286
void set(Index input_features, Index output_features, Type weight_type=Type::FP32)
Configures the underlying CombinationOp; ReLU is fixed.
ActivationOp activation
Definition operators.h:288
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:298
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
Definition operators.h:294
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
Definition operators.h:293
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void set_parameters_random() override
Initializes parameters with random values.
Definition operators.h:297
CombinationOp combination
Definition operators.h:287
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
Definition operators.h:295
2D convolution operator (NHWC layout) backed by Eigen on CPU and cuDNN on GPU.
Definition operators.h:397
TensorView bias
Definition operators.h:412
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
Index kernel_channels
Definition operators.h:404
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
TensorView weight_gradient
Definition operators.h:414
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
void set(Index input_h, Index input_w, Index kernels_n, Index kernel_h, Index kernel_w, Index kernel_c, Index row_stride, Index column_stride, Index padding_h, Index padding_w, Type compute_dtype)
Configures the convolution geometry and compute precision.
ConvolutionOp & operator=(const ConvolutionOp &)=delete
TensorView bias_gradient
Definition operators.h:415
Index input_width
Definition operators.h:399
~ConvolutionOp() override
Definition operators.h:461
Index padding_height
Definition operators.h:406
Index padding_width
Definition operators.h:407
ConvolutionOp(const ConvolutionOp &)=delete
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
Index kernel_width
Definition operators.h:403
Type compute_dtype
Definition operators.h:409
Index kernel_height
Definition operators.h:402
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
void set_parameters_random() override
Initializes parameters with random values.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Index kernels_number
Definition operators.h:401
Index input_height
Definition operators.h:398
void apply_cpu(const TensorView &input, TensorView &output)
CPU forward path; im2col + GEMM.
void apply_delta(const TensorView &input, const TensorView &output_delta, TensorView &input_delta) const
Computes input_delta from output_delta and updates weight/bias gradients.
void apply_gpu(const TensorView &input, TensorView &output, cudnnActivationDescriptor_t fused_activation=nullptr)
GPU forward path; runs cuDNN convolution with an optional fused activation.
TensorView weights
Definition operators.h:411
void set(Index input_h, Index input_w, Index kernels_n, Index kernel_h, Index kernel_w, Index kernel_c, Index row_stride, Index column_stride, Index padding_h, Index padding_w, Type compute_dtype)
Configures the underlying ConvolutionOp; ReLU is fixed.
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
Definition operators.h:508
void set_parameters_random() override
Initializes parameters with random values.
Definition operators.h:512
~ConvolutionReluOp() override
Definition operators.h:516
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
Definition operators.h:510
ActivationOp activation
Definition operators.h:499
ConvolutionReluOp(const ConvolutionReluOp &)=delete
ConvolutionOp convolution
Definition operators.h:498
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:513
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
Definition operators.h:515
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
Definition operators.h:509
ConvolutionReluOp & operator=(const ConvolutionReluOp &)=delete
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Inverted dropout: at training time zeros activations with probability rate and rescales survivors.
Definition operators.h:122
bool active() const
Returns true when the dropout rate is non-zero.
Definition operators.h:130
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
vector< size_t > save_slots
Definition operators.h:127
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
DropoutOp(DropoutOp &&) noexcept=default
float rate
Definition operators.h:123
void apply_gpu(TensorView &output)
GPU forward implementation; samples the mask and rescales survivors in place.
void to_JSON(JsonWriter &w) const override
Serializes the operator configuration to a JSON writer.
~DropoutOp() override
Definition operators.h:154
Buffer mask
Definition operators.h:125
void set_rate(float new_rate)
Sets the drop probability (0 disables dropout).
void from_JSON(const Json *parent) override
Restores the operator configuration from a JSON node.
void apply_cpu(TensorView &output)
CPU forward implementation; samples the mask and rescales survivors in place.
void apply_delta(TensorView &delta) const
Applies the cached mask to a gradient tensor during the backward pass.
DropoutOp()=default
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
Token embedding lookup with optional scaling and additive positional encoding.
Definition operators.h:947
TensorView weights
Definition operators.h:957
bool add_positional_encoding
Definition operators.h:953
TensorView weight_gradient
Definition operators.h:960
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
void init_positional_encoding()
Fills the positional-encoding state tensor with the standard sinusoidal pattern.
bool scale_embedding
Definition operators.h:952
Index sequence_length
Definition operators.h:949
Index vocabulary_size
Definition operators.h:948
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
TensorView positional_encoding
Definition operators.h:958
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
float embedding_scale
Definition operators.h:955
vector< TensorSpec > state_specs() const override
Returns the tensor specs of persistent state owned by this operator.
void link_states(span< const TensorView > views) override
Binds state views provided by the hosting layer.
Index embedding_dimension
Definition operators.h:950
void set(Index new_vocabulary_size, Index new_sequence_length, Index new_embedding_dimension)
Configures the lookup table dimensions.
void set_parameters_random() override
Initializes parameters with random values.
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition enum_map.h:18
Flattens a multi-dimensional tensor into a 2D (batch, features) tensor.
Definition operators.h:995
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Workspace holding the activations of every layer during a forward pass.
Definition forward_propagation.h:20
vector< vector< vector< TensorView > > > views
Definition forward_propagation.h:45
Layer normalization with learnable scale/shift, applied across the embedding dimension.
Definition operators.h:530
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
TensorView gamma_gradient
Definition operators.h:537
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
TensorView gamma
Definition operators.h:534
void set(Index sequence_length, Index embedding_dimension)
Configures the operator for a (sequence_length, embedding_dimension) input.
Index sequence_length
Definition operators.h:531
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:548
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Index embedding_dimension
Definition operators.h:532
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
TensorView beta
Definition operators.h:535
TensorView beta_gradient
Definition operators.h:538
void init_defaults()
Resets gamma to one and beta to zero.
void set_parameters_random() override
Initializes parameters with random values.
Definition operators.h:547
Reshapes (batch, heads, seq, head_dim) tensors back into (batch, seq, embed); no parameters.
Definition operators.h:839
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Index head_dimension
Definition operators.h:842
Index query_sequence_length
Definition operators.h:841
Type compute_dtype
Definition operators.h:843
void set(Index heads_number, Index query_sequence_length, Index head_dimension, Type compute_dtype)
Configures the merge geometry.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Index heads_number
Definition operators.h:840
Projects (input_features) into (heads * head_dim) and reshapes for multi-head attention.
Definition operators.h:579
bool accumulate_input_delta_cross
Definition operators.h:600
Index head_dimension
Definition operators.h:583
CombinationOp combination
Definition operators.h:580
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Index input_features
Definition operators.h:581
vector< size_t > input_delta_slots_cross
Definition operators.h:598
vector< size_t > scratch_slots
Definition operators.h:592
vector< size_t > input_delta_slots_self
Definition operators.h:597
Type compute_dtype
Definition operators.h:584
bool accumulate_input_delta_self
Definition operators.h:599
void apply_delta(const TensorView &head_delta, const TensorView &input, TensorView &input_delta, bool accumulate, float *scratch) const
Computes input_delta from per-head gradients and updates the projection weight gradient.
void set_parameters_random() override
Initializes parameters with random values.
Definition operators.h:613
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void apply(const TensorView &input, TensorView &head_output, float *scratch)
Projects input and reshapes the result into per-head form in head_output.
void link_parameters(span< const TensorView > views) override
Binds parameter views provided by the hosting layer.
Definition operators.h:610
void set_parameters_glorot() override
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:614
vector< TensorSpec > parameter_specs() const override
Returns the tensor specs of trainable parameters owned by this operator.
Definition operators.h:609
void link_gradients(span< const TensorView > views) override
Binds gradient views provided by the hosting layer.
Definition operators.h:611
Index heads_number
Definition operators.h:582
void set(Index input_features, Index heads_number, Index head_dimension, Type compute_dtype)
Configures the projection geometry.
size_t input_view_index
Definition operators.h:589
Base class for compute building blocks composed by layers (matmul, activation, dropout, ...).
Definition operators.h:28
virtual void destroy_cuda()
Releases CUDA resources owned by the operator; called from destructors.
Definition operators.h:74
virtual vector< TensorSpec > parameter_specs() const
Returns the tensor specs of trainable parameters owned by this operator.
Definition operators.h:32
TensorView & get_input(ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept
Definition operators.h:82
virtual void link_parameters(span< const TensorView >)
Binds parameter views provided by the hosting layer.
Definition operators.h:38
TensorView & get_output(ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept
Definition operators.h:92
virtual void back_propagate(ForwardPropagation &, BackPropagation &, size_t) const noexcept
Runs the operator's backward computation, accumulating into gradient/delta buffers.
Definition operators.h:62
virtual void from_JSON(const Json *)
Restores the operator configuration from a JSON node.
Definition operators.h:68
virtual void to_JSON(JsonWriter &) const
Serializes the operator configuration to a JSON writer.
Definition operators.h:65
virtual void link_gradients(span< const TensorView >)
Binds gradient views provided by the hosting layer.
Definition operators.h:41
virtual void load_state_from_JSON(const Json *)
Restores persistent state (e.g. running statistics) from a JSON node.
Definition operators.h:71
TensorView & get_output_delta(BackPropagation &bp, size_t layer, size_t i=0) const noexcept
Definition operators.h:97
vector< TensorView > & get_inputs(ForwardPropagation &fp, size_t layer, size_t i=0) const noexcept
Definition operators.h:87
vector< size_t > output_slots
Definition operators.h:77
TensorView & get_input_delta(BackPropagation &bp, size_t layer, size_t i=0) const noexcept
Definition operators.h:102
vector< size_t > output_delta_slots
Definition operators.h:80
vector< size_t > input_delta_slots
Definition operators.h:79
virtual void link_states(span< const TensorView >)
Binds state views provided by the hosting layer.
Definition operators.h:44
virtual void forward_propagate(ForwardPropagation &, size_t, bool) noexcept
Runs the operator's forward computation.
Definition operators.h:56
virtual void set_parameters_glorot()
Initializes parameters using Glorot (Xavier) initialization.
Definition operators.h:50
virtual ~Operator()=default
vector< size_t > input_slots
Definition operators.h:76
virtual void set_parameters_random()
Initializes parameters with random values.
Definition operators.h:47
virtual vector< TensorSpec > state_specs() const
Returns the tensor specs of persistent state owned by this operator.
Definition operators.h:35
Sequence-wide 1D pooling over the embedding dimension (average or max).
Definition operators.h:930
Method method
Definition operators.h:933
Method
Supported pooling reductions.
Definition operators.h:932
@ Max
Definition operators.h:932
@ Average
Definition operators.h:932
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
void set(Index input_h, Index input_w, Index input_c, Index pool_h, Index pool_w, Index row_stride, Index column_stride, Index padding_h, Index padding_w, Method method)
Configures the pooling geometry.
void back_propagate(ForwardPropagation &fp, BackPropagation &bp, size_t layer) const noexcept override
Runs the operator's backward computation, accumulating into gradient/delta buffers.
PoolOp(const PoolOp &)=delete
Method
Supported pooling reductions.
Definition operators.h:862
@ Average
Definition operators.h:862
@ Max
Definition operators.h:862
Index padding_height
Definition operators.h:872
Index input_channels
Definition operators.h:866
~PoolOp() override
Definition operators.h:900
Index row_stride
Definition operators.h:870
Index pool_height
Definition operators.h:868
Index column_stride
Definition operators.h:871
Index pool_width
Definition operators.h:869
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
Method method
Definition operators.h:875
PoolOp & operator=(const PoolOp &)=delete
Index input_height
Definition operators.h:864
PoolOp()=default
Index input_width
Definition operators.h:865
Index padding_width
Definition operators.h:873
void destroy_cuda() override
Releases CUDA resources owned by the operator; called from destructors.
Scales inputs to a target range using per-feature minimum/maximum or mean/std statistics.
Definition operators.h:1019
TensorView minimums
Definition operators.h:1023
float min_range
Definition operators.h:1020
TensorView means
Definition operators.h:1025
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.
TensorView maximums
Definition operators.h:1024
float max_range
Definition operators.h:1021
TensorView scalers
Definition operators.h:1027
TensorView standard_deviations
Definition operators.h:1026
Non-owning view over a tensor: pointer, shape, and data type with rich reshape helpers.
Definition tensor_utilities.h:293
Inverse of ScaleOp: maps normalized outputs back to the original feature range.
Definition operators.h:1035
TensorView scalers
Definition operators.h:1043
float max_range
Definition operators.h:1037
TensorView means
Definition operators.h:1041
TensorView maximums
Definition operators.h:1040
TensorView minimums
Definition operators.h:1039
TensorView standard_deviations
Definition operators.h:1042
float min_range
Definition operators.h:1036
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Runs the operator's forward computation.