OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
operators.h
Go to the documentation of this file.
1// OpenNN: Open Neural Networks Library
2// www.opennn.net
3//
4// O P E R A T O R S H E A D E R
5//
6// Artificial Intelligence Techniques SL
7// artelnics@artelnics.com
8
9#pragma once
10
11#include "tensor_utilities.h"
12#include "enum_map.h"
13
14namespace opennn
15{
16
17class Json;
18class JsonWriter;
19
20#ifdef OPENNN_HAS_CUDA
21struct LtMatmulPlan;
22#endif
23
25
27{
 // Body of the polymorphic Operator base (its declaration line is not
 // present in this listing). Every virtual hook below has an empty or
 // default body, so a derived operator overrides only the hooks it needs.

 // Virtual destructor: derived operators can be destroyed through an
 // Operator pointer or reference.
28 virtual ~Operator() = default;
29
 // Default: no parameters and no states, i.e. an empty list of
 // (Shape, Type) pairs.
30 virtual vector<pair<Shape, Type>> parameter_specs() const { return {}; }
31 virtual vector<pair<Shape, Type>> state_specs() const { return {}; }
32
 // Default: the views passed in are ignored.
 // NOTE(review): presumably each override receives one view per entry of
 // the matching *_specs() list, in the same order - confirm against callers.
33 virtual void link_parameters(const vector<TensorView>&) {}
34 virtual void link_gradients (const vector<TensorView>&) {}
35 virtual void link_states (const vector<TensorView>&) {}
36
 // Parameter initialisation hooks; no-ops by default.
37 virtual void set_parameters_random() {}
38 virtual void set_parameters_glorot() {}
39
 // Forward hook; the overrides in this file name the arguments
 // (fp, layer, is_training). Declared noexcept: an exception escaping an
 // override terminates the program instead of propagating to the caller.
40 virtual void forward_propagate(ForwardPropagation&, size_t, bool) noexcept {}
41
 // JSON (de)serialisation hooks; no-ops by default.
42 virtual void to_JSON (JsonWriter&) const {}
43 virtual void from_JSON(const Json*) {}
44 virtual void load_state_from_JSON(const Json*) {}
45
 // GPU resource release hook; no-op by default. Several derived operators
 // in this file call their own override from their destructor.
46 virtual void destroy_cuda() {}
47
 // Slot indices used by forward_propagate. Per the "Slot convention"
 // comments elsewhere in this file, they are filled in by the hosting layer.
48 vector<size_t> input_slots;
49 vector<size_t> output_slots;
50};
51
// Operator that overrides only forward_propagate. It declares no parameter or
// state specs of its own, so the Operator defaults (empty lists) apply.
// NOTE(review): presumably sums its input tensors into the output - inferred
// from the name only; confirm in the implementation file.
52struct Add : Operator
53{
 // Forward entry point; noexcept, so an escaping exception terminates.
54 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
55
56private:
 // CPU path: reads `inputs`, writes `output`.
57 void apply_cpu(const vector<TensorView>& inputs, TensorView& output);
 // The GPU path is declared only when OPENNN_HAS_CUDA is defined.
58#ifdef OPENNN_HAS_CUDA
59 void apply_gpu(const vector<TensorView>& inputs, TensorView& output);
60#endif
 // Const helper taking the same inputs/output pair.
 // NOTE(review): presumably validates that they are compatible - confirm.
61 void check(const vector<TensorView>& inputs, const TensorView& output) const;
62};
63
65{
 // Body of the Dropout operator. Its declaration line and the members on
 // listing lines 68 and 70 are not present in this listing.

 // Dropout rate. The default of 0 leaves the operator inactive (see active()).
66 float rate = 0.0f;
67
69
71
72 // Optional: layer needs the pre-dropout values preserved for backward
73 // (activation derivative). When empty, no copy is performed.
74 vector<size_t> save_slots;
75
 // True only when rate is strictly positive.
76 bool active() const { return rate > 0.0f; }
77
 // NOTE(review): any range validation of new_rate would live in the
 // implementation file, which is not visible here.
78 void set_rate(float new_rate);
79
 // Forward entry point; noexcept, so an escaping exception terminates.
80 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
81
 // apply takes a mutable view; apply_delta is const and takes a mutable delta.
 // NOTE(review): both presumably modify their argument in place - confirm.
82 void apply(TensorView& output);
83 void apply_delta(TensorView& delta) const;
84
85 void to_JSON(JsonWriter& w) const override;
86 void from_JSON(const Json* parent) override;
87
88 void destroy_cuda() override;
89
 // The destructor calls destroy_cuda(). Inside a destructor the call
 // resolves to Dropout's own override, not to that of a further-derived type.
90 ~Dropout() override { destroy_cuda(); }
91
 // Move-only: declaring the move operations suppresses the implicit copy
 // constructor and copy assignment.
 // NOTE(review): the defaulted moves rely on every member leaving the
 // moved-from object safe for the destroy_cuda() call in the destructor -
 // confirm for the members not shown in this listing.
92 Dropout() = default;
93 Dropout(Dropout&&) noexcept = default;
94 Dropout& operator=(Dropout&&) noexcept = default;
95
96private:
 // Device-specific counterparts of apply / apply_delta. Unlike Add::apply_gpu,
 // the *_gpu declarations here are not guarded by OPENNN_HAS_CUDA.
97 void apply_cpu(TensorView& output);
98 void apply_gpu(TensorView& output);
99
100 void apply_delta_cpu(TensorView& delta) const;
101 void apply_delta_gpu(TensorView& delta) const;
102
 // NOTE(review): presumably sizes the mask storage for n elements - confirm.
103 void ensure_mask(Index n);
104};
105
107{
 // Body of the Activation operator. Its declaration line and the members on
 // listing lines 108, 113, 115 and 117 (including the Function enum used
 // below) are not present in this listing.
109
 // Static helpers converting between Function values and their string names.
 // NOTE(review): behaviour of from_string for an unknown name is not visible
 // here - confirm in the implementation.
110 static const EnumMap<Function>& map();
111 static Function from_string(const string& name);
112 static const string& to_string(Function function);
114
116
118
 // Select the activation function, either by enum value or by name.
119 void set_function(Function new_function);
120 void set_function(const string& name);
121
 // Forward entry point; noexcept, so an escaping exception terminates.
122 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
123
 // apply takes a single mutable view. apply_delta is const and takes the
 // forward `outputs` plus a mutable `delta`.
 // NOTE(review): presumably both work in place, with the derivative
 // computed from the outputs - confirm.
124 void apply(TensorView& output);
125 void apply_delta(const TensorView& outputs, TensorView& delta) const;
126
127 void to_JSON(JsonWriter& w) const override;
128 void from_JSON(const Json* parent) override;
129
130 void destroy_cuda() override;
131
 // The destructor calls destroy_cuda(). Inside a destructor the call
 // resolves to Activation's own override.
132 ~Activation() override { destroy_cuda(); }
133
 // Copying is deleted. Because the copy operations and the destructor are
 // user-declared, no implicit move operations are generated, so the type
 // is neither copyable nor movable.
134 Activation() = default;
135 Activation(const Activation&) = delete;
136 Activation& operator=(const Activation&) = delete;
137
138private:
 // Device-specific counterparts of apply / apply_delta. The *_gpu
 // declarations are not guarded by OPENNN_HAS_CUDA.
139 void apply_cpu(TensorView& output);
140 void apply_gpu(TensorView& output);
141
142 void apply_delta_cpu(const TensorView& outputs, TensorView& delta) const;
143 void apply_delta_gpu(const TensorView& outputs, TensorView& delta) const;
144};
145
147{
148 Index input_features = 0;
151
153
156
159
160 void set(Index new_input_features, Index new_output_features, Type new_weight_type = Type::FP32);
161
162 vector<pair<Shape, Type>> parameter_specs() const override;
163 void link_parameters (const vector<TensorView>& views) override;
164 void link_gradients(const vector<TensorView>& views) override;
165
166 void set_parameters_random() override;
167 void set_parameters_glorot() override;
168
169 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
170
172
173 void apply_delta(const TensorView& output_delta,
174 const TensorView& input,
175 TensorView& input_delta,
176 bool accumulate_input_delta = false) const;
177
178private:
179 void apply_cpu(const TensorView& input, TensorView& output, cublasLtEpilogue_t epilogue);
180 void apply_gpu(const TensorView& input, TensorView& output, cublasLtEpilogue_t epilogue);
181
182 void apply_delta_cpu(const TensorView& output_delta, const TensorView& input,
183 TensorView& input_delta, bool accumulate_input_delta) const;
184 void apply_delta_gpu(const TensorView& output_delta, const TensorView& input,
185 TensorView& input_delta, bool accumulate_input_delta) const;
186
187#ifdef OPENNN_HAS_CUDA
188 mutable const LtMatmulPlan* fwd_plan_ = nullptr;
189 mutable int fwd_total_rows_ = -1;
190 mutable cublasLtEpilogue_t fwd_epilogue_ = CUBLASLT_EPILOGUE_DEFAULT;
191
192 mutable const LtMatmulPlan* bwd_plan_ = nullptr;
193 mutable int bwd_total_rows_ = -1;
194 mutable int bwd_io_dtype_ = -1;
195#endif
196};
197
199{
200 Index features = 0;
201 float momentum = 0.1f;
202
207
210
211 bool active() const { return features > 0; }
212
213 void set(Index new_features, float new_momentum = 0.1f);
214
215 vector<pair<Shape, Type>> parameter_specs() const override;
216 vector<pair<Shape, Type>> state_specs() const override;
217 void link_parameters (const vector<TensorView>& views) override;
218 void link_gradients(const vector<TensorView>& views) override;
219 void link_states (const vector<TensorView>& views) override;
220
223
225
226 // Slot convention (set by hosting layer):
227 // input_slots = {input}
228 // output_slots = {output, mean, inverse_variance}
229 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
230
231 void apply_inference(const TensorView& input, TensorView& output);
232 void apply_training (const TensorView& input,
233 TensorView& mean, TensorView& inverse_variance,
234 TensorView& output);
235 void apply_delta (const TensorView& input,
236 const TensorView& mean,
237 const TensorView& inverse_variance,
238 TensorView& delta) const;
239
241 void invalidate_inference_cache() { inference_cache_dirty = true; }
242
243 void to_JSON(JsonWriter& w) const override;
244 void from_JSON(const Json* parent) override;
245 void load_state_from_JSON(const Json* parent) override;
246
247private:
248 VectorR inference_scale;
249 VectorR inference_shift;
250 bool inference_cache_dirty = true;
251
252 mutable VectorR delta_scale_scratch;
253
254 void apply_inference_cpu(const TensorView& input, TensorView& output);
255 void apply_inference_gpu(const TensorView& input, TensorView& output);
256
257 void apply_training_cpu (const TensorView& input,
258 TensorView& mean, TensorView& inverse_variance,
259 TensorView& output);
260 void apply_training_gpu (const TensorView& input,
261 TensorView& mean, TensorView& inverse_variance,
262 TensorView& output);
263
264 void apply_delta_cpu(const TensorView& input,
265 const TensorView& mean,
266 const TensorView& inverse_variance,
267 TensorView& delta) const;
268 void apply_delta_gpu(const TensorView& input,
269 const TensorView& mean,
270 const TensorView& inverse_variance,
271 TensorView& delta) const;
272};
273
275{
276 Index input_height = 0;
277 Index input_width = 0;
278
279 Index kernels_number = 0;
280 Index kernel_height = 0;
281 Index kernel_width = 0;
283
284 Index padding_height = 0;
285 Index padding_width = 0;
286
288
289 // Set by hosting layer to fuse ReLU into the cuDNN forward call (GPU only).
291
294
297
298#ifdef OPENNN_HAS_CUDA
299 cudnnFilterDescriptor_t kernel_descriptor = nullptr;
300 cudnnConvolutionDescriptor_t convolution_descriptor = nullptr;
301
305
306 Buffer workspace{Device::CUDA};
307 Buffer backward_filter_workspace{Device::CUDA};
308
309 // High-water-mark of the batch size for which the cuDNN plan
310 // (algorithms + workspaces) is currently valid. Lazy-initialized on the
311 // first apply_gpu and re-tuned only if a larger batch arrives (e.g. test
312 // batch larger than training).
313 Index planned_batch_size = 0;
314#endif
315
316 void set(Index input_h, Index input_w,
317 Index kernels_n, Index kernel_h, Index kernel_w, Index kernel_c,
318 Index row_stride, Index column_stride,
319 Index padding_h, Index padding_w,
321
322 vector<pair<Shape, Type>> parameter_specs() const override;
323 void link_parameters (const vector<TensorView>& views) override;
324 void link_gradients(const vector<TensorView>& views) override;
325
326 void set_parameters_random() override;
327 void set_parameters_glorot() override;
328
329 void destroy_cuda() override;
330
331 ~Convolution() override { destroy_cuda(); }
332
333 Convolution() = default;
334 Convolution(const Convolution&) = delete;
336
337 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
338
339 void apply(const TensorView& input,
340 TensorView& output,
342
343 void apply_delta(const TensorView& input,
344 const TensorView& output_delta,
345 TensorView& input_delta) const;
346
347private:
348 void apply_cpu(const TensorView& input, TensorView& output);
349 void apply_gpu(const TensorView& input, TensorView& output, cudnnActivationDescriptor_t fused_activation);
350
351 void apply_delta_cpu(const TensorView& input, const TensorView& output_delta,
352 TensorView& input_delta) const;
353 void apply_delta_gpu(const TensorView& input, const TensorView& output_delta,
354 TensorView& input_delta) const;
355
356#ifdef OPENNN_HAS_CUDA
357 void plan_convolution_algorithms(const TensorView& input, const TensorView& output);
358#endif
359};
360
362{
365
368
371
373
374 vector<pair<Shape, Type>> parameter_specs() const override;
375 void link_parameters (const vector<TensorView>& views) override;
376 void link_gradients(const vector<TensorView>& views) override;
377
380
382
383 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
384
385 void apply(const TensorView& input,
386 TensorView& means, TensorView& standard_deviations, TensorView& normalized,
387 TensorView& output);
388
389 void apply_delta(const TensorView& input,
390 const TensorView& output_delta,
391 const TensorView& means, const TensorView& standard_deviations,
392 const TensorView& normalized,
393 TensorView& input_delta) const;
394
395private:
396 void apply_cpu(const TensorView& input,
397 TensorView& means, TensorView& standard_deviations, TensorView& normalized,
398 TensorView& output);
399
400 void apply_gpu(const TensorView& input,
401 TensorView& means, TensorView& standard_deviations,
402 TensorView& output);
403
404 void apply_delta_cpu(const TensorView& output_delta,
405 const TensorView& standard_deviations,
406 const TensorView& normalized,
407 TensorView& input_delta) const;
408 void apply_delta_gpu(const TensorView& input,
409 const TensorView& output_delta,
410 const TensorView& means, const TensorView& standard_deviations,
411 TensorView& input_delta) const;
412};
413
415{
417 Index input_features = 0;
418 Index heads_number = 0;
419 Index head_dimension = 0;
421
423
424 vector<pair<Shape, Type>> parameter_specs() const override { return combination.parameter_specs(); }
425 void link_parameters (const vector<TensorView>& views) override { combination.link_parameters(views); }
426 void link_gradients(const vector<TensorView>& views) override { combination.link_gradients(views); }
427
428 void set_parameters_random() override { combination.set_parameters_random(); }
429 void set_parameters_glorot() override { combination.set_parameters_glorot(); }
430
431 // input: {batch, seq_len, input_features}
432 // head_output: {batch, heads_number, seq_len, head_dimension}
433 // scratch: batch * seq_len * input_features floats (caller-owned)
434 void apply(const TensorView& input, TensorView& head_output, float* scratch);
435
436 void apply_delta(const TensorView& head_gradient,
437 const TensorView& input,
438 TensorView& input_gradient,
439 bool accumulate,
440 float* scratch) const;
441};
442
444{
445 Index heads_number = 0;
446 Index head_dimension = 0;
449 bool use_causal_mask = false;
451
453
455
456 void set(Index heads_number, Index head_dimension,
459
460 void set_dropout_rate(float rate) { dropout.set_rate(rate); }
461
462 vector<pair<Shape, Type>> forward_scratch_specs(Index batch_size) const;
463
464 void apply(const TensorView& query, // {B, H, Q_seq, D}
465 const TensorView& key, // {B, H, S_seq, D}
466 const TensorView& value, // {B, H, S_seq, D}
467 const TensorView& source_input, // {B, S_seq, embed} for padding mask
468 TensorView& attention_weights, // {B, H, Q_seq, S_seq} on CPU; empty on GPU
469 TensorView& attention_weights_dropped, // CPU-only, optional
470 TensorView& output, // {B, H, Q_seq, D}
471 float* mask_scratch,
472 bool is_training);
473
474 void apply_delta(const TensorView& query,
475 const TensorView& key,
476 const TensorView& value,
477 const TensorView& attention_output, // forward output O — only read by GPU SDPA
478 const TensorView& attention_weights,
479 const TensorView& attention_weights_dropped,
480 const TensorView& output_gradient, // {B, H, Q_seq, D}
481 TensorView& attention_weight_gradient, // CPU-only scratch; empty on GPU
482 TensorView& query_gradient,
483 TensorView& key_gradient,
484 TensorView& value_gradient) const;
485
486 void to_JSON(JsonWriter& w) const override;
487 void from_JSON(const Json* parent) override;
488
489 void destroy_cuda() override;
490
492 ~Attention() override;
493 Attention(Attention&&) noexcept;
494 Attention& operator=(Attention&&) noexcept;
495 Attention(const Attention&) = delete;
496 Attention& operator=(const Attention&) = delete;
497
498 struct SDPACache;
499
500private:
501 float scaling_factor() const;
502
503 void apply_cpu(const TensorView& query,
504 const TensorView& key,
505 const TensorView& value,
506 const TensorView& source_input,
507 TensorView& attention_weights,
508 TensorView& attention_weights_dropped,
509 TensorView& output,
510 float* mask_scratch,
511 bool is_training);
512
513 void apply_gpu(const TensorView& query,
514 const TensorView& key,
515 const TensorView& value,
516 const TensorView& source_input,
517 TensorView& attention_weights,
518 TensorView& attention_weights_dropped,
519 TensorView& output,
520 float* mask_scratch,
521 bool is_training);
522
523 // Same signature as apply_delta(); CPU path ignores attention_output.
524 void apply_delta_cpu(const TensorView& query,
525 const TensorView& key,
526 const TensorView& value,
527 const TensorView& attention_output,
528 const TensorView& attention_weights,
529 const TensorView& attention_weights_dropped,
530 const TensorView& output_gradient,
531 TensorView& attention_weight_gradient,
532 TensorView& query_gradient,
533 TensorView& key_gradient,
534 TensorView& value_gradient) const;
535
536 void apply_delta_gpu(const TensorView& query,
537 const TensorView& key,
538 const TensorView& value,
539 const TensorView& attention_output,
540 const TensorView& attention_weights,
541 const TensorView& attention_weights_dropped,
542 const TensorView& output_gradient,
543 TensorView& attention_weight_gradient,
544 TensorView& query_gradient,
545 TensorView& key_gradient,
546 TensorView& value_gradient) const;
547
548#ifdef OPENNN_HAS_CUDA
549 // Unfused GPU backward — used when SDPA isn't supported for the shape/dtype.
550 // Same op sequence as apply_delta_cpu but with cuDNN softmax backward.
551 void apply_delta_gpu_unfused(const TensorView& query,
552 const TensorView& key,
553 const TensorView& value,
554 const TensorView& attention_weights,
555 const TensorView& attention_weights_dropped,
556 const TensorView& output_gradient,
557 TensorView& attention_weight_gradient,
558 TensorView& query_gradient,
559 TensorView& key_gradient,
560 TensorView& value_gradient) const;
561#endif
562
563 mutable std::unique_ptr<SDPACache> sdpa_cache;
564};
565
567{
568 Index input_height = 0;
569 Index input_width = 0;
570 Index input_channels = 0;
571
572 Index pool_height = 1;
573 Index pool_width = 1;
574 Index row_stride = 1;
575 Index column_stride = 1;
576 Index padding_height = 0;
577 Index padding_width = 0;
578
579 int method = 0; // 0 = Max, 1 = Average (avoids forward-decl-only enum at this point)
580
581#ifdef OPENNN_HAS_CUDA
582 cudnnPoolingDescriptor_t pooling_descriptor = nullptr;
583#endif
584
585 void set(Index input_h, Index input_w, Index input_c,
586 Index pool_h, Index pool_w,
587 Index row_stride, Index column_stride,
588 Index padding_h, Index padding_w,
589 int method);
590
591 void destroy_cuda() override;
592
593 ~Pool() override { destroy_cuda(); }
594
595 Pool() = default;
596 Pool(const Pool&) = delete;
597 Pool& operator=(const Pool&) = delete;
598
599 // Slot convention (set by Pooling layer in update_pool_operator):
600 // input_slots = {Input}
601 // output_slots = {Output, MaximalIndices} for MaxPooling
602 // output_slots = {Output} for AveragePooling
603 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
604
605 void apply(const TensorView& input, TensorView& output, TensorView& maximal_indices, bool is_training);
606 void apply_delta(const TensorView& input,
607 const TensorView& output,
608 const TensorView& output_delta,
610 TensorView& input_delta) const;
611
612private:
613 void apply_cpu(const TensorView& input, TensorView& output, TensorView& maximal_indices, bool is_training);
614 void apply_gpu(const TensorView& input, TensorView& output);
615
616 void apply_delta_cpu(const TensorView& output_delta,
618 TensorView& input_delta) const;
619 void apply_delta_gpu(const TensorView& input,
620 const TensorView& output,
621 const TensorView& output_delta,
622 TensorView& input_delta) const;
623};
624
626{
627 int method = 0; // 0 = Max, 1 = Average
628
629 // Slot convention (set by Pooling3d layer):
630 // input_slots = {Input}
631 // output_slots = {Output, MaximalIndices}
632 // For AveragePooling, MaximalIndices is allocated empty (Shape{}).
633 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
634};
635
637{
641
642 bool scale_embedding = false;
644
645 float embedding_scale = 1.0f;
646
649
651
652 void set(Index new_vocabulary_size, Index new_sequence_length, Index new_embedding_dimension);
653
654 vector<pair<Shape, Type>> parameter_specs() const override;
655 vector<pair<Shape, Type>> state_specs() const override;
656 void link_parameters (const vector<TensorView>& views) override;
657 void link_gradients(const vector<TensorView>& views) override;
658 void link_states (const vector<TensorView>& views) override;
659
660 void set_parameters_random() override;
661 void set_parameters_glorot() override;
662
664
665 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
666
667 void apply(const TensorView& indices, TensorView& output);
668 void apply_delta(const TensorView& indices,
669 const TensorView& output_delta) const;
670
671private:
672 void apply_cpu(const TensorView& indices, TensorView& output);
673 void apply_gpu(const TensorView& indices, TensorView& output);
674
675 void apply_delta_cpu(const TensorView& indices, const TensorView& output_delta) const;
676 void apply_delta_gpu(const TensorView& indices, const TensorView& output_delta) const;
677};
678
680{
681 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
682};
683
685{
686 enum class Method { NoBounding, Bounding };
687
689 Index features = 0;
690
693
694 void set(Method new_method, Index new_features);
695
696 vector<pair<Shape, Type>> state_specs() const override;
697 void link_states(const vector<TensorView>& views) override;
698
699 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
700
701 void load_state_from_JSON(const Json* parent) override;
702};
703
705{
706 Index features = 0;
707 float min_range = -1.0f;
708 float max_range = 1.0f;
709
715
716 void set(Index new_features);
717
718 vector<pair<Shape, Type>> state_specs() const override;
719 void link_states(const vector<TensorView>& views) override;
720
721 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
722
723 void load_state_from_JSON(const Json* parent) override;
724};
725
727{
728 Index features = 0;
729 float min_range = -1.0f;
730 float max_range = 1.0f;
731
737
738 void set(Index new_features);
739
740 vector<pair<Shape, Type>> state_specs() const override;
741 void link_states(const vector<TensorView>& views) override;
742
743 void forward_propagate(ForwardPropagation& fp, size_t layer, bool is_training) noexcept override;
744
745 void load_state_from_JSON(const Json* parent) override;
746};
747
748}
749
750// OpenNN: Open Neural Networks Library.
751// Copyright(C) 2005-2026 Artificial Intelligence Techniques, SL.
752// Licensed under the GNU Lesser General Public License v2.1 or later.
Definition json.h:84
Definition json.h:22
Definition adaptive_moment_estimation.h:19
cublasLtEpilogue_t
Definition neural_network.h:85
@ CUBLASLT_EPILOGUE_DEFAULT
Definition neural_network.h:85
@ CUBLASLT_EPILOGUE_BIAS
Definition neural_network.h:85
float mean(const VectorR &)
Matrix< float, Dynamic, Dynamic, Layout > MatrixR
Definition neural_network.h:152
Matrix< float, Dynamic, 1 > VectorR
Definition neural_network.h:156
@ CUDA
Definition configuration.h:16
VectorI maximal_indices(const VectorR &, Index)
cudnnActivationMode_t
Definition neural_network.h:89
Type
Definition configuration.h:18
@ FP32
Definition configuration.h:18
cudnnConvolutionBwdFilterAlgo_t
Definition neural_network.h:94
@ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0
Definition neural_network.h:94
cudnnConvolutionBwdDataAlgo_t
Definition neural_network.h:93
@ CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
Definition neural_network.h:93
cudnnConvolutionFwdAlgo_t
Definition neural_network.h:92
@ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM
Definition neural_network.h:92
void * cudnnActivationDescriptor_t
Definition neural_network.h:102
void * cudnnPoolingDescriptor_t
Definition neural_network.h:101
void * cudnnConvolutionDescriptor_t
Definition neural_network.h:100
void * cudnnFilterDescriptor_t
Definition neural_network.h:99
static Function from_string(const string &name)
void apply_delta(const TensorView &outputs, TensorView &delta) const
Function
Definition operators.h:108
@ Sigmoid
Definition operators.h:108
@ Softmax
Definition operators.h:108
@ Identity
Definition operators.h:108
@ Tanh
Definition operators.h:108
@ ReLU
Definition operators.h:108
cudnnActivationDescriptor_t descriptor
Definition operators.h:117
~Activation() override
Definition operators.h:132
void destroy_cuda() override
static const string & to_string(Function function)
Function function
Definition operators.h:115
static const EnumMap< Function > & map()
void from_JSON(const Json *parent) override
Activation & operator=(const Activation &)=delete
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
void to_JSON(JsonWriter &w) const override
void set_function(const string &name)
void set_function(Function new_function)
void apply(TensorView &output)
Activation()=default
static cudnnActivationMode_t to_cudnn_mode(Function function)
Activation(const Activation &)=delete
Definition operators.h:53
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Index heads_number
Definition operators.h:445
void apply_delta(const TensorView &query, const TensorView &key, const TensorView &value, const TensorView &attention_output, const TensorView &attention_weights, const TensorView &attention_weights_dropped, const TensorView &output_gradient, TensorView &attention_weight_gradient, TensorView &query_gradient, TensorView &key_gradient, TensorView &value_gradient) const
Attention(Attention &&) noexcept
Type compute_dtype
Definition operators.h:450
~Attention() override
vector< pair< Shape, Type > > forward_scratch_specs(Index batch_size) const
void destroy_cuda() override
void set_dropout_rate(float rate)
Definition operators.h:460
MatrixR causal_mask
Definition operators.h:452
void apply(const TensorView &query, const TensorView &key, const TensorView &value, const TensorView &source_input, TensorView &attention_weights, TensorView &attention_weights_dropped, TensorView &output, float *mask_scratch, bool is_training)
void to_JSON(JsonWriter &w) const override
void set(Index heads_number, Index head_dimension, Index query_sequence_length, Index source_sequence_length, bool use_causal_mask, Type compute_dtype)
bool use_causal_mask
Definition operators.h:449
void from_JSON(const Json *parent) override
Index query_sequence_length
Definition operators.h:447
Index head_dimension
Definition operators.h:446
Dropout dropout
Definition operators.h:454
Index source_sequence_length
Definition operators.h:448
Definition operators.h:199
bool active() const
Definition operators.h:211
TensorView gamma_gradient
Definition operators.h:208
void apply_training(const TensorView &input, TensorView &mean, TensorView &inverse_variance, TensorView &output)
void invalidate_inference_cache()
Definition operators.h:241
void apply_delta(const TensorView &input, const TensorView &mean, const TensorView &inverse_variance, TensorView &delta) const
TensorView gamma
Definition operators.h:203
void link_gradients(const vector< TensorView > &views) override
TensorView beta
Definition operators.h:204
vector< pair< Shape, Type > > parameter_specs() const override
TensorView running_variance
Definition operators.h:206
void apply_inference(const TensorView &input, TensorView &output)
void link_states(const vector< TensorView > &views) override
void link_parameters(const vector< TensorView > &views) override
vector< pair< Shape, Type > > state_specs() const override
void set(Index new_features, float new_momentum=0.1f)
TensorView running_mean
Definition operators.h:205
Index features
Definition operators.h:200
void load_state_from_JSON(const Json *parent) override
float momentum
Definition operators.h:201
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
void set_parameters_random() override
Definition operators.h:221
void set_parameters_glorot() override
Definition operators.h:222
void update_inference_cache()
void from_JSON(const Json *parent) override
void to_JSON(JsonWriter &w) const override
TensorView beta_gradient
Definition operators.h:209
Definition operators.h:685
Index features
Definition operators.h:689
vector< pair< Shape, Type > > state_specs() const override
Method method
Definition operators.h:688
Method
Definition operators.h:686
@ Bounding
Definition operators.h:686
@ NoBounding
Definition operators.h:686
void load_state_from_JSON(const Json *parent) override
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
TensorView upper
Definition operators.h:692
TensorView lower
Definition operators.h:691
void link_states(const vector< TensorView > &views) override
void set(Method new_method, Index new_features)
Definition tensor_utilities.h:144
Definition operators.h:147
vector< pair< Shape, Type > > parameter_specs() const override
TensorView weight_gradient
Definition operators.h:157
cublasLtEpilogue_t epilogue
Definition operators.h:152
void set(Index new_input_features, Index new_output_features, Type new_weight_type=Type::FP32)
void link_parameters(const vector< TensorView > &views) override
TensorView weights
Definition operators.h:154
TensorView bias
Definition operators.h:155
Type weight_type
Definition operators.h:150
void apply_delta(const TensorView &output_delta, const TensorView &input, TensorView &input_delta, bool accumulate_input_delta=false) const
void set_parameters_random() override
Index output_features
Definition operators.h:149
void apply(const TensorView &input, TensorView &output, cublasLtEpilogue_t epilogue=CUBLASLT_EPILOGUE_BIAS)
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
TensorView bias_gradient
Definition operators.h:158
void link_gradients(const vector< TensorView > &views) override
Index input_features
Definition operators.h:148
void set_parameters_glorot() override
Convolution(const Convolution &)=delete
Index kernel_channels
Definition operators.h:282
void link_gradients(const vector< TensorView > &views) override
Index padding_width
Definition operators.h:285
void apply_delta(const TensorView &input, const TensorView &output_delta, TensorView &input_delta) const
void set_parameters_random() override
Index input_height
Definition operators.h:276
TensorView weight_gradient
Definition operators.h:295
void destroy_cuda() override
Index padding_height
Definition operators.h:284
vector< pair< Shape, Type > > parameter_specs() const override
void set(Index input_h, Index input_w, Index kernels_n, Index kernel_h, Index kernel_w, Index kernel_c, Index row_stride, Index column_stride, Index padding_h, Index padding_w, Type compute_dtype)
Index kernel_width
Definition operators.h:281
TensorView bias
Definition operators.h:293
Type compute_dtype
Definition operators.h:287
TensorView weights
Definition operators.h:292
void apply(const TensorView &input, TensorView &output, cudnnActivationDescriptor_t fused_activation=nullptr)
Index kernel_height
Definition operators.h:280
~Convolution() override
Definition operators.h:331
void link_parameters(const vector< TensorView > &views) override
cudnnActivationDescriptor_t fused_activation
Definition operators.h:290
Convolution & operator=(const Convolution &)=delete
Index kernels_number
Definition operators.h:279
Index input_width
Definition operators.h:277
void set_parameters_glorot() override
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
TensorView bias_gradient
Definition operators.h:296
Definition operators.h:65
vector< size_t > save_slots
Definition operators.h:74
void destroy_cuda() override
void from_JSON(const Json *parent) override
float rate
Definition operators.h:66
void apply(TensorView &output)
Buffer mask
Definition operators.h:70
~Dropout() override
Definition operators.h:90
void to_JSON(JsonWriter &w) const override
Dropout()=default
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Dropout(Dropout &&) noexcept=default
VectorR mask_cpu
Definition operators.h:68
void apply_delta(TensorView &delta) const
bool active() const
Definition operators.h:76
void set_rate(float new_rate)
Definition operators.h:637
void link_parameters(const vector< TensorView > &views) override
void set(Index new_vocabulary_size, Index new_sequence_length, Index new_embedding_dimension)
bool scale_embedding
Definition operators.h:642
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Index sequence_length
Definition operators.h:639
vector< pair< Shape, Type > > state_specs() const override
void apply_delta(const TensorView &indices, const TensorView &output_delta) const
Index vocabulary_size
Definition operators.h:638
void apply(const TensorView &indices, TensorView &output)
TensorView weights
Definition operators.h:647
void set_parameters_glorot() override
void link_states(const vector< TensorView > &views) override
float embedding_scale
Definition operators.h:645
Index embedding_dimension
Definition operators.h:640
vector< pair< Shape, Type > > parameter_specs() const override
void link_gradients(const vector< TensorView > &views) override
TensorView weight_gradient
Definition operators.h:650
bool add_positional_encoding
Definition operators.h:643
TensorView positional_encoding
Definition operators.h:648
void set_parameters_random() override
Definition enum_map.h:18
Definition operators.h:680
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Definition forward_propagation.h:19
Definition operators.h:362
TensorView beta_gradient
Definition operators.h:370
void apply(const TensorView &input, TensorView &means, TensorView &standard_deviations, TensorView &normalized, TensorView &output)
Index sequence_length
Definition operators.h:363
TensorView gamma
Definition operators.h:366
vector< pair< Shape, Type > > parameter_specs() const override
Index embedding_dimension
Definition operators.h:364
TensorView gamma_gradient
Definition operators.h:369
TensorView beta
Definition operators.h:367
void set(Index sequence_length, Index embedding_dimension)
void set_parameters_glorot() override
Definition operators.h:379
void apply_delta(const TensorView &input, const TensorView &output_delta, const TensorView &means, const TensorView &standard_deviations, const TensorView &normalized, TensorView &input_delta) const
void link_gradients(const vector< TensorView > &views) override
void set_parameters_random() override
Definition operators.h:378
void link_parameters(const vector< TensorView > &views) override
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Definition operators.h:415
void apply_delta(const TensorView &head_gradient, const TensorView &input, TensorView &input_gradient, bool accumulate, float *scratch) const
void link_parameters(const vector< TensorView > &views) override
Definition operators.h:425
void link_gradients(const vector< TensorView > &views) override
Definition operators.h:426
void set_parameters_glorot() override
Definition operators.h:429
Combination combination
Definition operators.h:416
vector< pair< Shape, Type > > parameter_specs() const override
Definition operators.h:424
void set(Index input_features, Index heads_number, Index head_dimension, Type compute_dtype)
Index head_dimension
Definition operators.h:419
Type compute_dtype
Definition operators.h:420
Index heads_number
Definition operators.h:418
void apply(const TensorView &input, TensorView &head_output, float *scratch)
Index input_features
Definition operators.h:417
void set_parameters_random() override
Definition operators.h:428
Definition operators.h:27
virtual void destroy_cuda()
Definition operators.h:46
virtual void link_states(const vector< TensorView > &)
Definition operators.h:35
virtual void link_parameters(const vector< TensorView > &)
Definition operators.h:33
virtual void from_JSON(const Json *)
Definition operators.h:43
virtual void to_JSON(JsonWriter &) const
Definition operators.h:42
virtual void load_state_from_JSON(const Json *)
Definition operators.h:44
vector< size_t > output_slots
Definition operators.h:49
virtual vector< pair< Shape, Type > > state_specs() const
Definition operators.h:31
virtual vector< pair< Shape, Type > > parameter_specs() const
Definition operators.h:30
virtual void forward_propagate(ForwardPropagation &, size_t, bool) noexcept
Definition operators.h:40
virtual void set_parameters_glorot()
Definition operators.h:38
virtual ~Operator()=default
vector< size_t > input_slots
Definition operators.h:48
virtual void link_gradients(const vector< TensorView > &)
Definition operators.h:34
virtual void set_parameters_random()
Definition operators.h:37
Definition operators.h:626
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
int method
Definition operators.h:627
~Pool() override
Definition operators.h:593
int method
Definition operators.h:579
Index pool_width
Definition operators.h:573
Pool(const Pool &)=delete
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
Pool & operator=(const Pool &)=delete
Pool()=default
Index input_width
Definition operators.h:569
void apply_delta(const TensorView &input, const TensorView &output, const TensorView &output_delta, const TensorView &maximal_indices, TensorView &input_delta) const
Index input_height
Definition operators.h:568
Index row_stride
Definition operators.h:574
void set(Index input_h, Index input_w, Index input_c, Index pool_h, Index pool_w, Index row_stride, Index column_stride, Index padding_h, Index padding_w, int method)
void apply(const TensorView &input, TensorView &output, TensorView &maximal_indices, bool is_training)
Index pool_height
Definition operators.h:572
void destroy_cuda() override
Index column_stride
Definition operators.h:575
Index input_channels
Definition operators.h:570
Index padding_width
Definition operators.h:577
Index padding_height
Definition operators.h:576
Definition operators.h:705
TensorView scalers
Definition operators.h:714
Index features
Definition operators.h:706
TensorView minimums
Definition operators.h:710
void link_states(const vector< TensorView > &views) override
float min_range
Definition operators.h:707
void load_state_from_JSON(const Json *parent) override
void set(Index new_features)
TensorView standard_deviations
Definition operators.h:713
TensorView means
Definition operators.h:712
vector< pair< Shape, Type > > state_specs() const override
float max_range
Definition operators.h:708
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
TensorView maximums
Definition operators.h:711
Definition tensor_utilities.h:236
Definition operators.h:727
float min_range
Definition operators.h:729
Index features
Definition operators.h:728
TensorView maximums
Definition operators.h:733
TensorView standard_deviations
Definition operators.h:735
float max_range
Definition operators.h:730
void link_states(const vector< TensorView > &views) override
void load_state_from_JSON(const Json *parent) override
TensorView scalers
Definition operators.h:736
vector< pair< Shape, Type > > state_specs() const override
void forward_propagate(ForwardPropagation &fp, size_t layer, bool is_training) noexcept override
TensorView minimums
Definition operators.h:732
void set(Index new_features)
TensorView means
Definition operators.h:734