data_set.h
1// OpenNN: Open Neural Networks Library
2// www.opennn.net
3//
4// D A T A S E T C L A S S H E A D E R
5//
6// Artificial Intelligence Techniques SL
7// artelnics@artelnics.com
8
9#ifndef DATASET_H
10#define DATASET_H
11
12// System includes
13
14#include <iostream>
15#include <fstream>
16#include <string>
17#include <sstream>
18#include <cmath>
19#include <algorithm>
20#include <cstdlib>
21#include <stdexcept>
22#include <ctime>
23#include <exception>
24#include <random>
25#include <regex>
26#include <map>
27#include <stdlib.h>
28#include <stdio.h>
29#include <limits.h>
30#include <list>
31
32// OpenNN includes
33
34#include "config.h"
35#include "statistics.h"
36#include "scaling.h"
37#include "correlations.h"
38#include "opennn_strings.h"
39#include "tensor_utilities.h"
40
41using namespace std;
42using namespace Eigen;
43
44namespace OpenNN
45{
46
48
55
57{
58
59public:
60
61 // Constructors
62
63 explicit DataSet();
64
65 explicit DataSet(const Tensor<type, 2>&);
66
67 explicit DataSet(const Index&, const Index&);
68
69 explicit DataSet(const Index&, const Index&, const Index&);
70
71 explicit DataSet(const string&, const char&, const bool&);
72
73 // Destructor
74
75 virtual ~DataSet();
76
77 // Enumerations
78
80
// Separator kinds selectable for the data file; the `separator` member of this class defaults to Comma.
81 enum class Separator{Space, Tab, Comma, Semicolon};
82
84
// Strategies for treating missing values; the `missing_values_method` member of this class defaults to Unuse.
85 enum class MissingValuesMethod{Unuse, Mean, Median};
86
88
// Enumeration of the learning tasks.
89 enum class ProjectType{Approximation, Classification, Forecasting, ImageClassification};
90
93
// Use assigned to a sample; one value per sample is stored in the `samples_uses` tensor.
94 enum class SampleUse{Training, Selection, Testing, UnusedSample};
95
98
// Use assigned to a column or variable (the return type of get_column_use() and get_variable_use()).
99 enum class VariableUse{Id, Input, Target, Time, UnusedVariable};
100
103
// Kind of a column (the return type of get_column_type()).
104 enum class ColumnType{Numeric, Binary, Categorical, DateTime, Constant};
105
106 // Structs
107
109
// Describes one column of the data set: its name, its categories, the use of each category and its scaler.
// NOTE(review): get_column_type() in the enclosing class reads `columns[index].type`, but no `type`
// member (nor a column-use member) is visible in this listing — presumably lost in extraction;
// confirm against the original header before relying on the member list below.
110 struct Column
111 {
113
// Default constructor.
114 Column();
115
117
// Constructs a column from a string (presumably the name — TODO confirm), a use, a type
// (default Numeric), a scaler (default MeanStandardDeviation), a tensor of category strings
// (default empty) and a tensor of per-category uses (default empty).
118 Column(const string&,
119 const VariableUse&,
120 const ColumnType& = ColumnType::Numeric,
121 const Scaler& = Scaler::MeanStandardDeviation,
122 const Tensor<string, 1>& = Tensor<string, 1>(),
123 const Tensor<VariableUse, 1>& = Tensor<VariableUse, 1>());
124
126
// Virtual destructor.
127 virtual ~Column();
128
130
// Column name.
131 string name;
132
134
136
138
140
142
// Category labels of the column.
143 Tensor<string, 1> categories;
144
146
// One VariableUse per entry; presumably parallel to `categories` — TODO confirm.
147 Tensor<VariableUse, 1> categories_uses;
148
// Scaler associated with this column.
149 Scaler scaler;
150
151 // Methods
152
153 Index get_variables_number() const;
154
155 Index get_categories_number() const;
156 Index get_used_categories_number() const;
157
158 Tensor<string, 1> get_used_variables_names() const;
159
// The scaler can be given either as a Scaler value or as a string.
160 void set_scaler(const Scaler&);
161 void set_scaler(const string&);
162
// The use can be given either as a VariableUse value or as a string.
163 void set_use(const VariableUse&);
164 void set_use(const string&);
165
// The type is given as a string; no ColumnType overload is declared here.
166 void set_type(const string&);
167
168 void add_category(const string&);
169
// Overloads: a tensor of strings, or a single VariableUse.
170 void set_categories_uses(const Tensor<string, 1>&);
171 void set_categories_uses(const VariableUse&);
172
// NOTE(review): these two predicates are declared non-const, unlike the other query methods above.
173 bool is_used();
174 bool is_unused();
175
// XML serialization: from_XML takes a tinyxml2 document, write_XML takes a tinyxml2 printer.
176 void from_XML(const tinyxml2::XMLDocument&);
177 void write_XML(tinyxml2::XMLPrinter&) const;
178
179 void print() const;
180 };
181
182 // Samples get methods
183
// Returns the number of samples, taken as the size of the `samples_uses` tensor.
184 inline Index get_samples_number() const {return samples_uses.size();}
185
186 Index get_training_samples_number() const;
187 Index get_selection_samples_number() const;
188 Index get_testing_samples_number() const;
189
190 Index get_used_samples_number() const;
191 Index get_unused_samples_number() const;
192
193 Tensor<Index, 1> get_training_samples_indices() const;
194 Tensor<Index, 1> get_selection_samples_indices() const;
195 Tensor<Index, 1> get_testing_samples_indices() const;
196
197 Tensor<Index, 1> get_used_samples_indices() const;
198 Tensor<Index, 1> get_unused_samples_indices() const;
199
200 SampleUse get_sample_use(const Index&) const;
201 const Tensor<SampleUse, 1>& get_samples_uses() const;
202
203 Tensor<Index, 1> get_samples_uses_numbers() const;
204 Tensor<type, 1> get_samples_uses_percentages() const;
205
206 string get_sample_string(const Index&, const string& = ",") const;
207
208 // Columns get methods
209
210 Tensor<Column, 1> get_columns() const;
211 Tensor<Column, 1> get_time_series_columns() const;
212 Index get_time_series_data_rows_number() const;
213 Tensor<Column, 1> get_input_columns() const;
214 Tensor<bool, 1> get_input_columns_binary() const;
215 Tensor<Column, 1> get_target_columns() const;
216 Tensor<Column, 1> get_used_columns() const;
217
218 Index get_columns_number() const;
219
220 Index get_input_columns_number() const;
221 Index get_input_time_series_columns_number() const;
222 Index get_target_columns_number() const;
223 Index get_target_time_series_columns_number() const;
224 Index get_time_columns_number() const;
225 Index get_unused_columns_number() const;
226 Index get_used_columns_number() const;
227
228 Index get_column_index(const string&) const;
229 Index get_column_index(const Index&) const;
230
231 Tensor<Index, 1> get_input_columns_indices() const;
232 Tensor<Index, 1> get_input_time_series_columns_indices() const;
233 Tensor<Index, 1> get_target_columns_indices() const;
234 Tensor<Index, 1> get_target_time_series_columns_indices() const;
235 Tensor<Index, 1> get_unused_columns_indices() const;
236 Tensor<Index, 1> get_used_columns_indices() const;
237
238 Tensor<string, 1> get_columns_names() const;
239
240 Tensor<string, 1> get_input_columns_names() const;
241 Tensor<string, 1> get_target_columns_names() const;
242 Tensor<string, 1> get_used_columns_names() const;
243
// Returns the `type` member of the column at position `index`; the index is passed straight to
// `columns[index]` with no explicit range check in this function.
// NOTE(review): the `type` member of Column is not visible in this listing — confirm it exists in the original header.
244 ColumnType get_column_type(const Index& index) const {return columns[index].type;}
245
246 VariableUse get_column_use(const Index& ) const;
247 Tensor<VariableUse, 1> get_columns_uses() const;
248
249 // Variables get methods
250
251 Index get_variables_number() const;
253
254 Index get_input_variables_number() const;
255 Index get_target_variables_number() const;
256 Index get_unused_variables_number() const;
257 Index get_used_variables_number() const;
258
259 string get_variable_name(const Index&) const;
260 Tensor<string, 1> get_variables_names() const;
261 Tensor<string, 1> get_time_series_variables_names() const;
262
263 Tensor<string, 1> get_input_variables_names() const;
264 Tensor<string, 1> get_target_variables_names() const;
265
266 Index get_variable_index(const string&name) const;
267
268 Tensor<Index, 1> get_variable_indices(const Index&) const;
269 Tensor<Index, 1> get_unused_variables_indices() const;
270 Tensor<Index, 1> get_used_variables_indices() const;
271 Tensor<Index, 1> get_input_variables_indices() const;
272 Tensor<Index, 1> get_target_variables_indices() const;
273
274 VariableUse get_variable_use(const Index&) const;
275 Tensor<VariableUse, 1> get_variables_uses() const;
276
277 const Tensor<Index, 1>& get_input_variables_dimensions() const;
278 Index get_input_variables_rank() const;
279
280 // Scalers get methods
281
282 Tensor<Scaler, 1> get_columns_scalers() const;
283
284 Tensor<Scaler, 1> get_input_variables_scalers() const;
285 Tensor<Scaler, 1> get_target_variables_scalers() const;
286
287 // Batches get methods
288
289 Tensor<Index, 2> get_batches(const Tensor<Index,1>&, const Index&, const bool&, const Index& buffer_size= 100) const;
290
291 // Data get methods
292
293 const Tensor<type, 2>& get_data() const;
294 Tensor<type, 2>* get_data_pointer();
295
296 const Tensor<type, 2>& get_time_series_data() const;
297
298 Tensor<type, 2> get_training_data() const;
299 Tensor<type, 2> get_selection_data() const;
300 Tensor<type, 2> get_testing_data() const;
301 Tensor<string, 1> get_time_series_columns_names() const;
302 Index get_time_series_columns_number() const;
303
304 Tensor<type, 2> get_input_data() const;
305 Tensor<type, 2> get_target_data() const;
306
307 Tensor<type, 2> get_input_data(const Tensor<Index, 1>&) const;
308 Tensor<type, 2> get_target_data(const Tensor<Index, 1>&) const;
309
310 Tensor<type, 2> get_training_input_data() const;
311 Tensor<type, 2> get_training_target_data() const;
312
313 Tensor<type, 2> get_selection_input_data() const;
314 Tensor<type, 2> get_selection_target_data() const;
315
316 Tensor<type, 2> get_testing_input_data() const;
317 Tensor<type, 2> get_testing_target_data() const;
318
319 Tensor<type, 1> get_sample_data(const Index&) const;
320 Tensor<type, 1> get_sample_data(const Index&, const Tensor<Index, 1>&) const;
321 Tensor<type, 2> get_sample_input_data(const Index&) const;
322 Tensor<type, 2> get_sample_target_data(const Index&) const;
323
324 Tensor<type, 2> get_column_data(const Index&) const;
325 Tensor<type, 2> get_column_data(const Index&, const Tensor<Index, 1>&) const;
326 Tensor<type, 2> get_column_data(const Tensor<Index, 1>&) const;
327 Tensor<type, 2> get_column_data(const string&) const;
328
329 Tensor<type, 1> get_variable_data(const Index&) const;
330 Tensor<type, 1> get_variable_data(const string&) const;
331
332 Tensor<type, 1> get_variable_data(const Index&, const Tensor<Index, 1>&) const;
333 Tensor<type, 1> get_variable_data(const string&, const Tensor<Index, 1>&) const;
334
335 Tensor<Tensor<string, 1>, 1> get_data_file_preview() const;
336
337 Tensor<type, 2> get_subtensor_data(const Tensor<Index, 1>&, const Tensor<Index, 1>&) const;
338
339 // Members get methods
340
342
343 const string& get_data_file_name() const;
344
345 const bool& get_header_line() const;
346 const bool& get_rows_label() const;
347
348 Tensor<string, 1> get_rows_label_tensor() const;
349 Tensor<string, 1> get_selection_rows_label_tensor();
350 Tensor<string, 1> get_testing_rows_label_tensor();
351
352 const Separator& get_separator() const;
353 char get_separator_char() const;
354 string get_separator_string() const;
355
356 const string& get_missing_values_label() const;
357
358 const Index& get_lags_number() const;
359 const Index& get_steps_ahead() const;
360 const string& get_time_column() const;
361 Index get_time_series_time_column_index() const;
362
363 static Tensor<string, 1> get_default_columns_names(const Index&);
364
365 static Scaler get_scaling_unscaling_method(const string&);
366
367 Index get_gmt() const;
368
369 const bool& get_display() const;
370
371 // Set methods
372
373 void set();
374 void set(const Tensor<type, 2>&);
375 void set(const Index&, const Index&);
376 void set(const Index&, const Index&, const Index&);
377 void set(const DataSet&);
378 void set(const tinyxml2::XMLDocument&);
379 void set(const string&);
380 void set(const string&, const char&, const bool&);
381
382 void set_default();
383
384 void set_threads_number(const int&);
385
386 // Samples set methods
387
388 void set_samples_number(const Index&);
389
390 void set_training();
391 void set_selection();
392 void set_testing();
393
394 void set_training(const Tensor<Index, 1>&);
395 void set_selection(const Tensor<Index, 1>&);
396 void set_testing(const Tensor<Index, 1>&);
397
398 void set_samples_unused();
399 void set_samples_unused(const Tensor<Index, 1>&);
400
401 void set_sample_use(const Index&, const SampleUse&);
402 void set_sample_use(const Index&, const string&);
403
404 void set_samples_uses(const Tensor<SampleUse, 1>&);
405 void set_samples_uses(const Tensor<string, 1>&);
406
407 // Columns set methods
408
409 void set_columns(const Tensor<Column, 1>&);
410
412
414
415 void set_column_name(const Index&, const string&);
416
417 void set_columns_uses(const Tensor<string, 1>&);
418 void set_columns_uses(const Tensor<VariableUse, 1>&);
419 void set_columns_unused();
420 void set_input_target_columns(const Tensor<Index, 1>&, const Tensor<Index, 1>&);
422
423 void set_input_columns(const Tensor<Index, 1>&, const Tensor<bool, 1>&);
424
425 void set_column_use(const Index&, const VariableUse&);
426 void set_column_use(const string&, const VariableUse&);
427
428 void set_column_type(const Index&, const ColumnType&);
429 void set_column_type(const string&, const ColumnType&);
430
431 void set_columns_names(const Tensor<string, 1>&);
432
433 void set_columns_number(const Index&);
434
435 void set_columns_scalers(const Scaler&);
436
437
438 void set_binary_simple_columns();
439
440 // Columns other methods
441
442 void check_constant_columns();
443
444 Tensor<type, 2> transform_binary_column(const Tensor<type, 1>&) const;
445
446 // Variables set methods
447
448 void set_variables_names(const Tensor<string, 1>&);
449 void set_variable_name(const Index&, const string&);
450
451 void set_input();
452 void set_target();
454
455 void set_input_variables_dimensions(const Tensor<Index, 1>&);
456
457 // Data set methods
458
459 void set_data(const Tensor<type, 2>&);
460
461 // Members set methods
462
463 void set_data_file_name(const string&);
464
465 void set_has_columns_names(const bool&);
466 void set_has_rows_label(const bool&);
467
468 void set_separator(const Separator&);
469 void set_separator(const string&);
470 void set_separator(const char&);
471
472 void set_missing_values_label(const string&);
474 void set_missing_values_method(const string&);
475
476 void set_lags_number(const Index&);
477 void set_steps_ahead_number(const Index&);
478 void set_time_column(const string&);
479
480 void set_gmt(Index&);
481
482 void set_display(const bool&);
483
484 // Check methods
485
486 bool is_empty() const;
487
488 bool is_sample_used(const Index&) const;
489 bool is_sample_unused(const Index&) const;
490
491 bool has_binary_columns() const;
492 bool has_categorical_columns() const;
493 bool has_time_columns() const;
494 bool has_time_time_series_columns() const;
495
496 bool has_selection() const;
497
498 // Splitting methods
499
500 void split_samples_sequential(const type& training_ratio = static_cast<type>(0.6),
501 const type& selection_ratio = static_cast<type>(0.2),
502 const type& testing_ratio = static_cast<type>(0.2));
503
504 void split_samples_random(const type& training_ratio = static_cast<type>(0.6),
505 const type& selection_ratio = static_cast<type>(0.2),
506 const type& testing_ratio = static_cast<type>(0.2));
507
508 // Unusing methods
509
510 Tensor<string, 1> unuse_constant_columns();
511
512 Tensor<Index, 1> unuse_repeated_samples();
513
514 Tensor<string, 1> unuse_uncorrelated_columns(const type& = type(0.25));
515
516 // Initialization methods
517
518 void set_data_constant(const type&);
519
520 void set_data_random();
522
523 // Descriptives methods
524
525 Tensor<Descriptives, 1> calculate_variables_descriptives() const;
526 Tensor<Descriptives, 1> calculate_used_variables_descriptives() const;
527
528 Tensor<Descriptives, 1> calculate_columns_descriptives_positive_samples() const;
529 Tensor<Descriptives, 1> calculate_columns_descriptives_negative_samples() const;
530 Tensor<Descriptives, 1> calculate_columns_descriptives_categories(const Index&) const;
531
532 Tensor<Descriptives, 1> calculate_columns_descriptives_training_samples() const;
533 Tensor<Descriptives, 1> calculate_columns_descriptives_selection_samples() const;
534
535 Tensor<Descriptives, 1> calculate_input_variables_descriptives() const;
536 Tensor<Descriptives, 1> calculate_target_variables_descriptives() const;
537
538 Tensor<Descriptives, 1> calculate_testing_target_variables_descriptives() const;
539
540 Tensor<type, 1> calculate_input_variables_minimums() const;
541 Tensor<type, 1> calculate_target_variables_minimums() const;
542 Tensor<type, 1> calculate_input_variables_maximums() const;
543 Tensor<type, 1> calculate_target_variables_maximums() const;
544
545 Tensor<type, 1> calculate_variables_means(const Tensor<Index, 1>&) const;
546 Tensor<type, 1> calculate_used_variables_minimums() const;
547
548 Tensor<type, 1> calculate_used_targets_mean() const;
549 Tensor<type, 1> calculate_selection_targets_mean() const;
550
551 Index calculate_used_negatives(const Index&) const;
552 Index calculate_training_negatives(const Index&) const;
553 Index calculate_selection_negatives(const Index&) const;
554 Index calculate_testing_negatives(const Index&) const;
555
556 // Distribution methods
557
558 Tensor<Histogram, 1> calculate_columns_distribution(const Index& = 10) const;
559
560 // Box and whiskers
561
562 Tensor<BoxPlot, 1> calculate_columns_box_plots() const;
563
564 // Inputs correlations
565
566 Tensor<Correlation, 2> calculate_input_columns_correlations() const;
567
568 void print_inputs_correlations() const;
569
571
572 // Inputs-targets correlations
573
574 Tensor<Correlation, 2> calculate_input_target_columns_correlations() const;
575
577
579
580 // Filtering methods
581
582 Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&);
583
584 // Scaling methods
585
587
588 // Data scaling
589
590 Tensor<Descriptives, 1> scale_data();
591
592 Tensor<Descriptives, 1> scale_input_variables();
593 Tensor<Descriptives, 1> scale_target_variables();
594
595 // Data unscaling
596
597 void unscale_data(const Tensor<Descriptives, 1>&);
598
599 void unscale_input_variables(const Tensor<Descriptives, 1>&);
600 void unscale_target_variables(const Tensor<Descriptives, 1>&);
601
602 // Classification methods
603
604 Tensor<Index, 1> calculate_target_distribution() const;
605
606 // Tukey outlier detection
607
608 Tensor<Tensor<Index, 1>, 1> calculate_Tukey_outliers(const type& = type(1.5)) const;
609
610 void unuse_Tukey_outliers(const type& = type(1.5));
611
612 // Local outlier factor
613
614 Tensor<Index, 1> calculate_local_outlier_factor_outliers(const Index& = 20, const Index& = 0, const type& = type(0)) const;
615
616 void unuse_local_outlier_factor_outliers(const Index& = 20, const type& = type(1.5));
617
618 // Isolation Forest outlier
619
620 Tensor<Index, 1> calculate_isolation_forest_outliers(const Index& = 100, const Index& = 256, const type& = type(0)) const;
621
622 void unuse_isolation_forest_outliers(const Index& = 20, const type& = type(1.5));
623
624 // Time series methods
625
627
629 void transform_time_series_data();
630 void get_time_series_columns_number(const Index&);
631 void set_time_series_data(const Tensor<type, 2>&);
632 void set_time_series_columns_number(const Index&);
633
634 Tensor<type, 2> get_time_series_column_data(const Index&) const;
635 Tensor<type, 2> calculate_autocorrelations(const Index& = 10) const;
636 Tensor<type, 3> calculate_cross_correlations(const Index& = 10) const;
637
638 // Data generation
639
640 void generate_constant_data(const Index&, const Index&, const type&);
641 void generate_random_data(const Index&, const Index&);
642 void generate_sequential_data(const Index&, const Index&);
643 void generate_Rosenbrock_data(const Index&, const Index&);
644 void generate_sum_data(const Index&, const Index&);
645
646 // Serialization methods
647
648 void print() const;
649
650 void from_XML(const tinyxml2::XMLDocument&);
651 void write_XML(tinyxml2::XMLPrinter&) const;
652
653 void save(const string&) const;
654 void load(const string&);
655
656 void print_columns() const;
657 void print_columns_types() const;
658 void print_columns_uses() const;
659
660 void print_data() const;
661 void print_data_preview() const;
662
663 void print_data_file_preview() const;
664
665 void save_data() const;
666
667 void save_data_binary(const string&) const;
668 void save_time_series_data_binary(const string&) const;
669
670 // Data load methods
671
672 void read_csv();
673
674 void load_data_binary();
675 void load_time_series_data_binary(const string&);
676
677 void check_input_csv(const string&, const char&) const;
678 Tensor<type, 2> read_input_csv(const string&, const char&, const string&, const bool&, const bool&) const;
679
680 // Transform methods
681
682 void fill_time_series(const Index&);
683
684 // Missing values
685
686 bool has_nan() const;
687
688 bool has_nan_row(const Index&) const;
689
691
695
697
698 Tensor<Index, 1> count_nan_columns() const;
699 Index count_rows_with_nan() const;
700 Index count_nan() const;
701
702 void set_missing_values_number(const Index&);
703 void set_missing_values_number();
704
705 void set_columns_missing_values_number(const Tensor<Index, 1>&);
706 void set_columns_missing_values_number();
707
708 void set_rows_missing_values_number(const Index&);
709 void set_rows_missing_values_number();
710
711 // Other methods
712
713 void fix_repeated_names();
714
715 // Eigen methods
716
717 Tensor<Index, 1> push_back(const Tensor<Index, 1>&, const Index&) const;
718 Tensor<string, 1> push_back(const Tensor<string, 1>&, const string&) const;
719
720 void initialize_sequential(Tensor<Index, 1>&, const Index&, const Index&, const Index&) const;
721 void intialize_sequential(Tensor<type, 1>&, const type&, const type&, const type&) const;
722
723 Tensor<Index, 2> split_samples(const Tensor<Index, 1>&, const Index&) const;
724
725 bool get_has_rows_labels() const;
726
727 void shuffle();
728
729 // Reader
730
731 void read_csv_1();
732
733 void read_csv_2_simple();
734 void read_csv_3_simple();
735
736 void read_csv_2_complete();
737 void read_csv_3_complete();
738
739 void check_separators(const string&) const;
740
741 void check_special_characters(const string&) const;
742
743private:
744
745 NonBlockingThreadPool* non_blocking_thread_pool = nullptr;
746 ThreadPoolDevice* thread_pool_device = nullptr;
747
748 // DATA
749
753
754 Tensor<type, 2> data;
755
756 // Samples
757
758 Tensor<SampleUse, 1> samples_uses;
759
760 Tensor<string, 1> rows_labels;
761
762 // Columns
763
764 Tensor<Column, 1> columns;
765
766 Tensor<Index, 1> input_variables_dimensions;
767
768 // DATA FILE
769
771
773
775
776 Separator separator = Separator::Comma;
777
779
780 string missing_values_label = "NA";
781
783
784 bool has_columns_names = false;
785
787
788 bool has_rows_labels = false;
789
790 Tensor<Tensor<string, 1>, 1> data_file_preview;
791
792 // TIME SERIES
793
795
797
799
800 Index lags_number = 0;
801
803
804 Index steps_ahead = 0;
805
809
810 Tensor<type, 2> time_series_data;
811
812 Tensor<Column, 1> time_series_columns;
813
814 Index gmt = 0;
815
816 // MISSING VALUES
817
819
820 MissingValuesMethod missing_values_method = MissingValuesMethod::Unuse;
821
823
825
826 Tensor<Index, 1> columns_missing_values_number;
827
// Number of rows with missing values. Default-initialized to zero, like the other scalar
// members of this class (lags_number, steps_ahead, gmt), so it never holds an indeterminate value.
828 Index rows_missing_values_number = 0;
829
831
832 bool display = true;
833
834 const Eigen::array<IndexPair<Index>, 1> product_vector_vector = {IndexPair<Index>(0, 0)}; // Vector product, (0,0) first vector is transpose
835
836 // Local Outlier Factor
837
838 Tensor<Index, 1> select_outliers_via_standard_deviation(const Tensor<type, 1>&, const type & = type(2.0), bool = true) const;
839
840 Tensor<Index, 1> select_outliers_via_contamination(const Tensor<type, 1>&, const type & = type(0.05), bool = true) const;
841
842 type calculate_euclidean_distance(const Tensor<Index, 1>&, const Index&, const Index&) const;
843
844 Tensor<type, 2> calculate_distance_matrix(const Tensor<Index, 1>&) const;
845
846 Tensor<list<Index>, 1> calculate_k_nearest_neighbors(const Tensor<type, 2>&, const Index& = 20) const;
847
848 Tensor<Tensor<type, 1>, 1> get_kd_tree_data() const;
849
850 Tensor<Tensor<Index, 1>, 1> create_bounding_limits_kd_tree(const Index&) const;
851
852 void create_kd_tree(Tensor<Tensor<type, 1>, 1>&, const Tensor<Tensor<Index, 1>, 1>&) const;
853
854 Tensor<list<Index>, 1> calculate_bounding_boxes_neighbors(const Tensor<Tensor<type, 1>, 1>&,
855 const Tensor<Index, 1>&,
856 const Index&, const Index&) const;
857
858 Tensor<list<Index>, 1> calculate_kd_tree_neighbors(const Index& = 20, const Index& = 40) const;
859
860 Tensor<type, 1> calculate_average_reachability(Tensor<list<Index>, 1>&, const Index&) const;
861
862 Tensor<type, 1> calculate_local_outlier_factor(Tensor<list<Index>, 1>&, const Tensor<type, 1>&, const Index &) const;
863
864
865 //Isolation Forest
866
867 void calculate_min_max_indices_list(list<Index>&, const Index&, type&, type&) const;
868
869 Index split_isolation_tree(Tensor<type, 2>&, list<list<Index>>&, list<Index>&) const;
870
871 Tensor<type, 2> create_isolation_tree(const Tensor<Index, 1>&, const Index&) const;
872
873 Tensor<Tensor<type, 2>, 1> create_isolation_forest(const Index&, const Index&, const Index&) const;
874
875 type calculate_tree_path(const Tensor<type, 2>&, const Index&, const Index&) const;
876
877 Tensor<type, 1> calculate_average_forest_paths(const Tensor<Tensor<type, 2>, 1>&, const Index&) const;
878
879};
880
881
882#ifdef OPENNN_CUDA
883 #include "../../opennn-cuda/opennn-cuda/data_set_cuda.h"
884#endif
885
// NOTE(review): the declaration line of this type (original line 886) is missing from this
// listing; the constructor and destructor names show the type is DataSetBatch.
// It holds a sample count, a pointer to a DataSet, and input/target tensors.
887{
889
891
// Constructs from an Index and a DataSet pointer; presumably the samples number and the
// source data set, mirroring set() below — TODO confirm.
892 DataSetBatch(const Index&, DataSet*);
893
895
// Empty destructor: it does not release `data_set_pointer`.
896 virtual ~DataSetBatch() {}
897
898 Index get_samples_number() const;
899
900 void set(const Index&, DataSet*);
901
// Takes three index tensors; presumably sample, input and target indices — TODO confirm against the implementation.
902 void fill(const Tensor<Index, 1>&, const Tensor<Index, 1>&, const Tensor<Index, 1>&);
903
904 void print() const;
905
// Defaults to 0.
906 Index samples_number = 0;
907
// Defaults to nullptr; not freed by the destructor above.
908 DataSet* data_set_pointer = nullptr;
909
// Inputs are available as a rank-2 and a rank-4 tensor; which one is filled is not visible here.
910 Tensor<type, 2> inputs_2d;
911 Tensor<type, 4> inputs_4d;
912
// Targets as a rank-2 tensor.
913 Tensor<type, 2> targets_2d;
914
915};
916
917
918}
919
920#endif
921
922// OpenNN: Open Neural Networks Library.
923// Copyright(C) 2005-2021 Artificial Intelligence Techniques, SL.
924//
925// This library is free software; you can redistribute it and/or
926// modify it under the terms of the GNU Lesser General Public
927// License as published by the Free Software Foundation; either
928// version 2.1 of the License, or any later version.
929//
930// This library is distributed in the hope that it will be useful,
931// but WITHOUT ANY WARRANTY; without even the implied warranty of
932// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
933// Lesser General Public License for more details.
934
935// You should have received a copy of the GNU Lesser General Public
936// License along with this library; if not, write to the Free Software
937// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
This class represents the concept of data set for data modelling problems, such as approximation,...
Definition: data_set.h:57
Index calculate_testing_negatives(const Index &) const
Definition: data_set.cpp:5468
bool is_sample_used(const Index &) const
Definition: data_set.cpp:910
ProjectType
Enumeration of the learning tasks.
Definition: data_set.h:89
Tensor< type, 1 > calculate_input_variables_maximums() const
Returns a vector containing the maximums of the input variables.
Definition: data_set.cpp:5784
void transform_time_series_columns()
This method transforms the columns into time series for forecasting problems.
Definition: data_set.cpp:790
Tensor< type, 2 > get_target_data() const
Definition: data_set.cpp:3972
Index get_training_samples_number() const
Returns the number of samples in the data set which will be used for training.
Definition: data_set.cpp:1382
Tensor< type, 1 > calculate_used_variables_minimums() const
Returns a vector containing the minimums of the used variables.
Definition: data_set.cpp:5800
Tensor< type, 2 > calculate_autocorrelations(const Index &=10) const
Definition: data_set.cpp:9315
VariableUse get_column_use(const Index &) const
Returns the use of the column, without taking into account the categories.
Definition: data_set.cpp:2000
Index get_target_variables_number() const
Returns the number of target variables of the data set.
Definition: data_set.cpp:2884
Tensor< Descriptives, 1 > calculate_columns_descriptives_training_samples() const
Definition: data_set.cpp:5693
void set_sample_use(const Index &, const SampleUse &)
Definition: data_set.cpp:1591
void set_training()
Sets all the samples in the data set for training.
Definition: data_set.cpp:1475
void impute_missing_values_unuse()
Sets all the samples with missing values to "Unused".
Definition: data_set.cpp:9753
Tensor< Descriptives, 1 > calculate_columns_descriptives_positive_samples() const
Calculate the descriptives of the samples with positive targets in binary classification problems.
Definition: data_set.cpp:5525
void save_data_binary(const string &) const
Saves to the data file the values of the data matrix in binary format.
Definition: data_set.cpp:7909
Tensor< string, 1 > get_target_columns_names() const
Returns a string vector which contains the names of the columns whose uses are Target.
Definition: data_set.cpp:2528
Tensor< string, 1 > get_columns_names() const
Returns a string vector that contains the names of the columns.
Definition: data_set.cpp:2473
Tensor< Descriptives, 1 > calculate_input_variables_descriptives() const
Definition: data_set.cpp:5726
void set_variable_name(const Index &, const string &)
Definition: data_set.cpp:3295
Tensor< Descriptives, 1 > scale_target_variables()
Definition: data_set.cpp:6298
Tensor< Index, 1 > calculate_local_outlier_factor_outliers(const Index &=20, const Index &=0, const type &=type(0)) const
Definition: data_set.cpp:9002
void set_input_variables_dimensions(const Tensor< Index, 1 > &)
Sets new input dimensions in the data set.
Definition: data_set.cpp:3650
Tensor< Index, 1 > get_training_samples_indices() const
Returns the indices of the samples which will be used for training.
Definition: data_set.cpp:1073
void set_input()
Sets all the variables in the data set as input variables.
Definition: data_set.cpp:3428
bool has_columns_names
True if the data file contains a header with the names of the columns.
Definition: data_set.h:784
void generate_constant_data(const Index &, const Index &, const type &)
Definition: data_set.cpp:9558
void generate_Rosenbrock_data(const Index &, const Index &)
Definition: data_set.cpp:9607
void set_data_random()
Definition: data_set.cpp:6449
Tensor< string, 1 > get_time_series_variables_names() const
Definition: data_set.cpp:2151
void print_top_input_target_columns_correlations() const
Definition: data_set.cpp:5983
void set_has_columns_names(const bool &)
Sets if the data file contains a header with the names of the columns.
Definition: data_set.cpp:4866
Tensor< Index, 1 > get_testing_samples_indices() const
Returns the indices of the samples which will be used for testing.
Definition: data_set.cpp:1123
void set_columns_number(const Index &)
Definition: data_set.cpp:3465
Tensor< Index, 1 > get_selection_samples_indices() const
Returns the indices of the samples which will be used for selection.
Definition: data_set.cpp:1098
Tensor< type, 1 > calculate_selection_targets_mean() const
Returns the mean values of the target variables on the selection.
Definition: data_set.cpp:5843
const bool & get_display() const
Definition: data_set.cpp:94
void set_selection()
Sets all the samples in the data set for selection.
Definition: data_set.cpp:1488
Tensor< Histogram, 1 > calculate_columns_distribution(const Index &=10) const
Definition: data_set.cpp:5201
const Tensor< type, 2 > & get_data() const
Definition: data_set.cpp:3673
Index get_variables_number() const
Returns the number of variables in the data set.
Definition: data_set.cpp:2814
void set_data_constant(const type &)
Definition: data_set.cpp:6440
Tensor< type, 2 > get_sample_target_data(const Index &) const
Definition: data_set.cpp:4181
void set_samples_unused()
Sets all the samples in the data set as unused.
Definition: data_set.cpp:1562
Tensor< type, 3 > calculate_cross_correlations(const Index &=10) const
Calculates the cross-correlation between all the variables in the data set.
Definition: data_set.cpp:9424
Tensor< Index, 1 > get_target_columns_indices() const
Returns an indices vector with the positions of the target columns.
Definition: data_set.cpp:2315
void split_samples_sequential(const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2))
Definition: data_set.cpp:1834
void unscale_input_variables(const Tensor< Descriptives, 1 > &)
Definition: data_set.cpp:6351
Index lags_number
Number of lags.
Definition: data_set.h:800
Tensor< Descriptives, 1 > calculate_target_variables_descriptives() const
Definition: data_set.cpp:5745
void set_target()
Sets all the variables in the data set as target variables.
Definition: data_set.cpp:3441
void impute_missing_values_mean()
Substitutes all the missing values by the mean of the corresponding variable.
Definition: data_set.cpp:9768
void set_default()
Definition: data_set.cpp:4796
string data_file_name
Data file name.
Definition: data_set.h:772
Tensor< Descriptives, 1 > calculate_variables_descriptives() const
Definition: data_set.cpp:5499
void check_input_csv(const string &, const char &) const
This method checks if the input data file has the correct format. Returns an error message.
Definition: data_set.cpp:8111
char get_separator_char() const
Returns the character which will be used as separator in the data file.
Definition: data_set.cpp:3771
const bool & get_rows_label() const
Returns true if the data file has row labels, and false otherwise.
Definition: data_set.cpp:3720
Tensor< Tensor< Index, 1 >, 1 > calculate_Tukey_outliers(const type &=type(1.5)) const
Definition: data_set.cpp:8540
void load(const string &)
Definition: data_set.cpp:7716
void load_data_binary()
This method loads the data from a binary data file.
Definition: data_set.cpp:8023
void set_samples_uses(const Tensor< SampleUse, 1 > &)
Definition: data_set.cpp:1637
string missing_values_label
Missing values label.
Definition: data_set.h:780
const Index & get_lags_number() const
Returns the number of lags to be used in a time series prediction application.
Definition: data_set.cpp:3825
void set_variables_unused()
Sets all the variables in the data set as unused variables.
Definition: data_set.cpp:3452
void load_time_series_data_binary(const string &)
This method loads time series data from a binary data.
Definition: data_set.cpp:8067
Index calculate_training_negatives(const Index &) const
Definition: data_set.cpp:5398
void set_gmt(Index &)
Definition: data_set.cpp:5865
Tensor< Index, 1 > get_used_samples_indices() const
Returns the indices of the used samples (those which are not set unused).
Definition: data_set.cpp:1148
void set_missing_values_method(const MissingValuesMethod &)
Definition: data_set.cpp:4985
Tensor< type, 1 > get_sample_data(const Index &) const
Definition: data_set.cpp:4093
Tensor< type, 2 > get_sample_input_data(const Index &) const
Definition: data_set.cpp:4163
bool display
Display messages to screen.
Definition: data_set.h:832
void impute_missing_values_median()
Substitutes all the missing values by the median of the corresponding variable.
Definition: data_set.cpp:9802
void set_separator(const Separator &)
Definition: data_set.cpp:4883
Tensor< type, 1 > calculate_input_variables_minimums() const
Returns a vector containing the minimums of the input variables.
Definition: data_set.cpp:5767
string get_sample_string(const Index &, const string &=",") const
Definition: data_set.cpp:1006
Index get_time_series_columns_number() const
Returns the number of columns in the time series.
Definition: data_set.cpp:2807
Index get_unused_samples_number() const
Definition: data_set.cpp:1455
virtual ~DataSet()
Destructor.
Definition: data_set.cpp:84
void print_inputs_correlations() const
Prints on screen the correlation between variables in the data set.
Definition: data_set.cpp:6064
Index get_unused_variables_number() const
Returns the number of variables which will neither be used as input nor as target.
Definition: data_set.cpp:2910
Index steps_ahead
Number of steps ahead.
Definition: data_set.h:804
bool is_empty() const
Returns true if the data matrix is empty, and false otherwise.
Definition: data_set.cpp:3658
void set_lags_number(const Index &)
Definition: data_set.cpp:5022
Index get_time_columns_number() const
Returns the number of columns whose uses are Time.
Definition: data_set.cpp:2643
const Tensor< Index, 1 > & get_input_variables_dimensions() const
Returns the dimensions of the input variables.
Definition: data_set.cpp:2245
Tensor< Index, 1 > calculate_target_distribution() const
Definition: data_set.cpp:8480
void set_samples_number(const Index &)
Definition: data_set.cpp:5062
Tensor< Index, 1 > get_input_columns_indices() const
Returns an indices vector with the positions of the inputs.
Definition: data_set.cpp:2271
SampleUse get_sample_use(const Index &) const
Definition: data_set.cpp:1199
Tensor< VariableUse, 1 > get_columns_uses() const
Returns the uses of each column of the data set.
Definition: data_set.cpp:2008
Tensor< string, 1 > get_used_columns_names() const
Returns a string vector which contains the names of the columns used whether Input,...
Definition: data_set.cpp:2551
void set()
Sets zero samples and zero variables in the data set.
Definition: data_set.cpp:4586
void set_has_rows_label(const bool &)
Sets if the data file contains rows label.
Definition: data_set.cpp:4874
Tensor< Index, 1 > get_target_variables_indices() const
Returns the indices of the target variables.
Definition: data_set.cpp:3094
Tensor< string, 1 > unuse_uncorrelated_columns(const type &=type(0.25))
Definition: data_set.cpp:5163
void set_variables_names(const Tensor< string, 1 > &)
Definition: data_set.cpp:3355
void split_samples_random(const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2))
Definition: data_set.cpp:1726
MissingValuesMethod
Enumeration of available methods for missing values in the data.
Definition: data_set.h:85
Tensor< Index, 1 > get_samples_uses_numbers() const
Definition: data_set.cpp:943
Tensor< Index, 1 > unuse_repeated_samples()
Definition: data_set.cpp:5112
void set_testing()
Sets all the samples in the data set for testing.
Definition: data_set.cpp:1501
void print_top_inputs_correlations() const
Definition: data_set.cpp:6091
const string & get_missing_values_label() const
Returns the string which will be used as label for the missing values in the data file.
Definition: data_set.cpp:3817
string get_variable_name(const Index &) const
Definition: data_set.cpp:2059
void print_data() const
Prints to the screen the values of the data matrix.
Definition: data_set.cpp:7783
const Separator & get_separator() const
Returns the separator to be used in the data file.
Definition: data_set.cpp:3763
Tensor< type, 1 > calculate_target_variables_minimums() const
Returns a vector containing the minimums of the target variables.
Definition: data_set.cpp:5775
void set_missing_values_label(const string &)
Definition: data_set.cpp:4961
Index get_selection_samples_number() const
Returns the number of samples in the data set which will be used for selection.
Definition: data_set.cpp:1402
const Tensor< SampleUse, 1 > & get_samples_uses() const
Returns the use of every sample (training, selection, testing or unused) in a vector.
Definition: data_set.cpp:1207
Index get_variable_index(const string &name) const
Definition: data_set.cpp:2937
Tensor< type, 2 > get_selection_target_data() const
Definition: data_set.cpp:4052
void save_data() const
Saves to the data file the values of the data matrix.
Definition: data_set.cpp:7844
void save(const string &) const
Definition: data_set.cpp:7681
Tensor< type, 2 > get_testing_data() const
Definition: data_set.cpp:3938
Tensor< Descriptives, 1 > calculate_used_variables_descriptives() const
Definition: data_set.cpp:5514
MissingValuesMethod missing_values_method
Missing values method.
Definition: data_set.h:820
Tensor< type, 2 > get_training_data() const
Definition: data_set.cpp:3900
Tensor< bool, 1 > get_input_columns_binary() const
Returns the input columns of the data set.
Definition: data_set.cpp:2739
Tensor< type, 2 > get_testing_input_data() const
Definition: data_set.cpp:4066
void transform_time_series()
Arranges an input-target DataSet from a time series matrix, according to the number of lags.
Definition: data_set.cpp:8007
VariableUse get_variable_use(const Index &) const
Definition: data_set.cpp:1992
Tensor< type, 1 > calculate_variables_means(const Tensor< Index, 1 > &) const
Definition: data_set.cpp:5809
Tensor< Descriptives, 1 > calculate_columns_descriptives_selection_samples() const
Definition: data_set.cpp:5712
string get_separator_string() const
Returns the string which will be used as separator in the data file.
Definition: data_set.cpp:3794
Separator
Enumeration of available separators for the data file.
Definition: data_set.h:81
Index get_input_variables_number() const
Definition: data_set.cpp:2859
Tensor< Index, 1 > get_unused_columns_indices() const
Returns an indices vector with the positions of the unused columns.
Definition: data_set.cpp:2359
Tensor< type, 2 > read_input_csv(const string &, const char &, const string &, const bool &, const bool &) const
This method loads data from a file and returns a matrix containing the input columns.
Definition: data_set.cpp:8182
Index get_target_columns_number() const
Returns the number of columns whose uses are Target.
Definition: data_set.cpp:2609
Tensor< type, 2 > time_series_data
Definition: data_set.h:810
Tensor< Correlation, 2 > calculate_input_columns_correlations() const
Definition: data_set.cpp:6019
void generate_random_data(const Index &, const Index &)
Definition: data_set.cpp:9574
void scrub_missing_values()
Definition: data_set.cpp:9828
Index get_testing_samples_number() const
Returns the number of samples in the data set which will be used for testing.
Definition: data_set.cpp:1422
const string & get_time_column() const
Returns the time column of the data set.
Definition: data_set.cpp:3841
Index calculate_selection_negatives(const Index &) const
Definition: data_set.cpp:5433
Index get_used_columns_number() const
Returns the number of columns that are used.
Definition: data_set.cpp:2679
bool has_nan_row(const Index &) const
Returns true if the given row contains missing values.
Definition: data_set.cpp:5922
Index get_time_series_variables_number() const
Returns the number of variables in the time series data.
Definition: data_set.cpp:2835
void print_missing_values_information() const
Definition: data_set.cpp:5940
void set_default_columns_names()
Definition: data_set.cpp:1968
Index calculate_used_negatives(const Index &) const
Definition: data_set.cpp:5363
Index get_used_samples_number() const
Definition: data_set.cpp:1443
void save_time_series_data_binary(const string &) const
Saves to the data file the values of the time series data matrix in binary format.
Definition: data_set.cpp:7958
string time_column
Index where time variable is located for forecasting applications.
Definition: data_set.h:796
Tensor< Index, 1 > get_used_columns_indices() const
Returns an indices vector with the positions of the used columns.
Definition: data_set.cpp:2383
Tensor< string, 1 > get_variables_names() const
Definition: data_set.cpp:2118
Tensor< type, 2 > get_selection_input_data() const
Definition: data_set.cpp:4038
Tensor< type, 2 > get_time_series_column_data(const Index &) const
Definition: data_set.cpp:4309
Index get_unused_columns_number() const
Returns the number of columns that are not used.
Definition: data_set.cpp:2661
const Index & get_steps_ahead() const
Returns the number of steps ahead to be used in a time series prediction application.
Definition: data_set.cpp:3833
bool has_rows_labels
Header which contains the rows label.
Definition: data_set.h:788
Tensor< Correlation, 2 > calculate_input_target_columns_correlations() const
Definition: data_set.cpp:5876
Index get_gmt() const
Definition: data_set.cpp:5856
void set_default_columns_scalers()
Definition: data_set.cpp:6128
void unscale_target_variables(const Tensor< Descriptives, 1 > &)
Definition: data_set.cpp:6397
Tensor< VariableUse, 1 > get_variables_uses() const
Definition: data_set.cpp:2026
static Scaler get_scaling_unscaling_method(const string &)
Definition: data_set.cpp:3861
void set_data_binary_random()
Definition: data_set.cpp:6458
void set_time_column(const string &)
Definition: data_set.cpp:5041
Tensor< Index, 1 > get_variable_indices(const Index &) const
Definition: data_set.cpp:4248
Tensor< type, 2 > data
Definition: data_set.h:754
Tensor< type, 1 > get_variable_data(const Index &) const
Definition: data_set.cpp:4353
Tensor< type, 2 > get_selection_data() const
Definition: data_set.cpp:3921
Tensor< type, 2 > get_column_data(const Index &) const
Definition: data_set.cpp:4289
Tensor< string, 1 > get_input_columns_names() const
Returns a string vector that contains the names of the columns whose uses are Input.
Definition: data_set.cpp:2505
void set_data_file_name(const string &)
Definition: data_set.cpp:4858
void print() const
Prints to the screen in text format the main numbers from the data set object.
Definition: data_set.cpp:7664
const bool & get_header_line() const
Returns true if the first line of the data file has a header with the names of the variables,...
Definition: data_set.cpp:3712
Index missing_values_number
Missing values.
Definition: data_set.h:824
Tensor< type, 2 > get_input_data() const
Definition: data_set.cpp:3955
void set_column_name(const Index &, const string &)
Definition: data_set.cpp:1983
void set_display(const bool &)
Definition: data_set.cpp:4785
Tensor< string, 1 > get_target_variables_names() const
Definition: data_set.cpp:2215
Tensor< Index, 1 > get_unused_samples_indices() const
Returns the indices of the samples set unused.
Definition: data_set.cpp:1173
void set_input_columns_unused()
Sets all input columns in the data_set as unused columns.
Definition: data_set.cpp:3229
void print_data_preview() const
Definition: data_set.cpp:7792
Tensor< type, 2 > get_training_input_data() const
Definition: data_set.cpp:4010
Tensor< Index, 1 > get_input_variables_indices() const
Returns the indices of the input variables.
Definition: data_set.cpp:3047
Tensor< string, 1 > get_input_variables_names() const
Definition: data_set.cpp:2184
void set_columns_unused()
Sets all columns in the data_set as unused columns.
Definition: data_set.cpp:3200
Tensor< Index, 1 > filter_data(const Tensor< type, 1 > &, const Tensor< type, 1 > &)
Definition: data_set.cpp:9662
void write_XML(tinyxml2::XMLPrinter &) const
Serializes the data set object into an XML document of the TinyXML library without keeping the DOM tree i...
Definition: data_set.cpp:6486
Tensor< Descriptives, 1 > calculate_columns_descriptives_negative_samples() const
Calculates the descriptives of the samples with negative targets in binary classification problems.
Definition: data_set.cpp:5584
Tensor< type, 1 > get_samples_uses_percentages() const
Definition: data_set.cpp:977
void set_data(const Tensor< type, 2 > &)
Definition: data_set.cpp:4831
Tensor< BoxPlot, 1 > calculate_columns_box_plots() const
Definition: data_set.cpp:5320
void generate_sequential_data(const Index &, const Index &)
Definition: data_set.cpp:9587
Tensor< Index, 2 > get_batches(const Tensor< Index, 1 > &, const Index &, const bool &, const Index &buffer_size=100) const
Definition: data_set.cpp:1217
Separator separator
Separator character.
Definition: data_set.h:776
Index get_column_index(const string &) const
Definition: data_set.cpp:4192
void set_columns_names(const Tensor< string, 1 > &)
Definition: data_set.cpp:3403
Tensor< Column, 1 > get_input_columns() const
Returns the input columns of the data set.
Definition: data_set.cpp:2717
Tensor< string, 1 > unuse_constant_columns()
Definition: data_set.cpp:5073
bool is_sample_unused(const Index &) const
Definition: data_set.cpp:926
void set_column_use(const Index &, const VariableUse &)
Definition: data_set.cpp:3255
MissingValuesMethod get_missing_values_method() const
Returns the method used for missing values.
Definition: data_set.cpp:3696
Tensor< Descriptives, 1 > calculate_columns_descriptives_categories(const Index &) const
Definition: data_set.cpp:5645
Index get_columns_number() const
Returns the number of columns in the data set.
Definition: data_set.cpp:2800
Tensor< Index, 1 > get_unused_variables_indices() const
Returns the indices of the unused variables.
Definition: data_set.cpp:2956
bool has_nan() const
Returns true if the data contain missing values.
Definition: data_set.cpp:5912
const string & get_data_file_name() const
Returns the name of the data file.
Definition: data_set.cpp:3704
void set_steps_ahead_number(const Index &)
Definition: data_set.cpp:5032
Tensor< Column, 1 > get_used_columns() const
Returns the used columns of the data set.
Definition: data_set.cpp:2781
void set_default_columns_uses()
Definition: data_set.cpp:1921
const Tensor< type, 2 > & get_time_series_data() const
Definition: data_set.cpp:3688
void print_input_target_columns_correlations() const
Prints on screen the correlation between targets and inputs.
Definition: data_set.cpp:5960
Tensor< type, 2 > get_testing_target_data() const
Definition: data_set.cpp:4080
Tensor< Descriptives, 1 > scale_input_variables()
Definition: data_set.cpp:6243
Tensor< Index, 1 > get_used_variables_indices() const
Returns the indices of the used variables.
Definition: data_set.cpp:3002
void unuse_Tukey_outliers(const type &=type(1.5))
Definition: data_set.cpp:8624
Tensor< Column, 1 > get_columns() const
Returns the columns of the data set.
Definition: data_set.cpp:2697
Tensor< type, 1 > calculate_target_variables_maximums() const
Returns a vector containing the maximums of the target variables.
Definition: data_set.cpp:5792
Index get_input_columns_number() const
Returns the number of columns whose uses are Input.
Definition: data_set.cpp:2575
Tensor< type, 2 > get_training_target_data() const
Definition: data_set.cpp:4024
Index get_used_variables_number() const
Returns the number of variables which are either input or target.
Definition: data_set.cpp:2259
Tensor< Column, 1 > get_target_columns() const
Returns the target columns of the data set.
Definition: data_set.cpp:2759
void set_columns_uses(const Tensor< string, 1 > &)
Definition: data_set.cpp:3142
Extensions to the C++ standard library.
Definition: half.hpp:2325
This structure represents the columns of the DataSet.
Definition: data_set.h:111
Index get_categories_number() const
Returns the number of categories.
Definition: data_set.cpp:735
virtual ~Column()
Destructor.
Definition: data_set.cpp:133
void set_categories_uses(const Tensor< string, 1 > &)
Definition: data_set.cpp:294
Tensor< string, 1 > categories
Categories within the column.
Definition: data_set.h:143
ColumnType type
Column type.
Definition: data_set.h:139
void add_category(const string &)
Definition: data_set.cpp:270
string name
Column name.
Definition: data_set.h:131
VariableUse column_use
Column use.
Definition: data_set.h:135
Tensor< VariableUse, 1 > categories_uses
Categories use.
Definition: data_set.h:147
void set_use(const VariableUse &)
Definition: data_set.cpp:182
void set_type(const string &)
Definition: data_set.cpp:230
Tensor< string, 1 > get_used_variables_names() const
Returns a string vector that contains the names of the used variables in the data set.
Definition: data_set.cpp:758
Index get_used_categories_number() const
Returns the number of used categories.
Definition: data_set.cpp:743
Column()
Default constructor.
Definition: data_set.cpp:102
DataSetBatch()
Default constructor.
Definition: data_set.h:890
virtual ~DataSetBatch()
Destructor.
Definition: data_set.h:896