OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
dataset.h
Go to the documentation of this file.
1// OpenNN: Open Neural Networks Library
2// www.opennn.net
3//
4// D A T A S E T C L A S S H E A D E R
5//
6// Artificial Intelligence Techniques SL
7// artelnics@artelnics.com
8
17
18#pragma once
19
20#include "correlations.h"
21#include "statistics.h"
22#include "tensor_utilities.h"
23#include "enum_map.h"
24#include "string_utilities.h"
25#include "variable.h"
26
27namespace opennn
28{
29
46
52{
53 static const vector<pair<SampleRole, string>> entries = {
54 {SampleRole::Training, "Training"},
55 {SampleRole::Validation, "Validation"},
56 {SampleRole::Testing, "Testing"},
57 {SampleRole::None, "None"}
58 };
59 static const EnumMap<SampleRole> map{entries};
60 return map;
61}
62
68inline const string& sample_role_to_string(SampleRole role)
69{
70 return sample_role_map().to_string(role);
71}
72
81inline SampleRole string_to_sample_role(const string& name)
82{
83 if (name == "0") return SampleRole::Training;
84 if (name == "1") return SampleRole::Validation;
85 if (name == "2") return SampleRole::Testing;
86 if (name == "3") return SampleRole::None;
87 return sample_role_map().from_string(name);
88}
89
108{
109
110public:
111
116 enum class Codification { UTF8, SHIFT_JIS };
117
124 Dataset(const Index samples_number = 0,
125 const Shape& input_shape = {0},
126 const Shape& target_shape = {0});
127
136 Dataset(const filesystem::path& data_path,
137 const string& separator,
138 bool has_header = true,
139 bool has_ids = false,
141
142 // Enumerations
143
149
155
160 Index get_samples_number() const {return data.rows();}
161
167 Index get_samples_number(const string& role_name) const;
168
174
180 vector<Index> get_sample_indices(const string& role_name) const;
181
186 vector<Index> get_used_sample_indices() const;
187
192 const vector<SampleRole>& get_sample_roles() const { return sample_roles; }
193
198 vector<Index> get_sample_roles_vector() const;
199
205
210 Index get_variables_number() const { return variables.size(); }
211
217 Index get_variables_number(const string& role_name) const;
218
224
229 const vector<Variable>& get_variables() const { return variables; }
230
236 vector<Variable> get_variables(const string& role_name) const;
237
243 Index get_variable_index(const string& name) const;
244
250 Index get_variable_index(const Index id) const;
251
257 vector<Index> get_variable_indices(const string& role_name) const;
258
263 vector<Index> get_used_variables_indices() const;
264
269 vector<string> get_variable_names() const;
270
276 vector<string> get_variable_names(const string& role_name) const;
277
283 VariableType get_variable_type(const Index index) const { return variables[index].type; }
284
290 vector<VariableType> get_variable_types(const vector<Index> indices) const;
291
300 Index get_features_number() const;
301
307 Index get_features_number(const string& role_name) const;
308
314
319 vector<string> get_feature_names() const;
320
326 vector<string> get_feature_names(const string& role_name) const;
327
332 vector<vector<Index>> get_feature_indices() const;
333
339 vector<Index> get_feature_indices(const Index variable_index) const;
340
346 vector<Index> get_feature_indices(const string& role_name) const;
347
352 vector<Index> get_used_feature_indices() const;
353
358 vector<Index> get_feature_dimensions() const;
359
365 Shape get_shape(const string& role_name) const;
366
372 vector<string> get_feature_scalers(const string& role_name) const;
373
384 virtual void get_batches(const vector<Index>& sample_indices, Index batch_size, bool shuffle, vector<vector<Index>>& batches) const;
385
390 const MatrixR& get_data() const { return data; }
391
397 MatrixR get_feature_data(const string& role_name) const;
398
405 MatrixR get_data(const string& sample_role, const string& variable_role) const;
406
413 MatrixR get_data_from_indices(const vector<Index>& sample_indices, const vector<Index>& variable_indices) const;
414
420 VectorR get_sample_data(const Index sample_index) const;
421
427 MatrixR get_variable_data(const Index variable_index) const;
428
435 MatrixR get_variable_data(const Index variable_index, const vector<Index>& sample_indices) const;
436
442 MatrixR get_variable_data(const string& variable_name) const;
443
448 const vector<vector<string>>& get_data_file_preview() const { return data_file_preview; }
449
455
461
466 const filesystem::path& get_data_path() const { return data_path; }
467
472 const Separator& get_separator() const { return separator; }
473
478 string get_separator_string() const;
479
484 string get_separator_name() const;
485
490 const Codification& get_codification() const { return codification; }
491
496 const string get_codification_string() const;
497
502 const string& get_missing_values_label() const { return missing_values_label; }
503
508 bool get_display() const { return display; }
509
514 bool is_empty() const { return data.size() == 0; }
515
521
527
534 void set(const Index samples_number = 0, const Shape& input_shape = {}, const Shape& target_shape = {});
535
544 void set(const filesystem::path& data_path,
545 const string& separator,
546 bool has_header = true,
547 bool has_ids = false,
549
554 void set(const filesystem::path& file_name);
555
560
565 void set_sample_roles(const string& role_name);
566
572 void set_sample_role(const Index sample_index, const string& role_name);
573
578 void set_sample_roles(const vector<string>& role_names);
579
585 void set_sample_roles(const vector<Index>& sample_indices, const string& role_name);
586
591 void set_variables(const vector<Variable>& new_variables) { variables = new_variables; }
592
597
602 virtual void set_variable_roles(const vector<string>& role_names);
603
608 void set_variables(const string& description);
609
615 void set_variable_indices(const vector<Index>& input_indices, const vector<Index>& target_indices);
616
621
627 void set_variable_role(const Index variable_index, const string& role_name);
628
634 void set_variable_role(const string& variable_name, const string& role_name);
635
641 void set_variable_type(const Index variable_index, const VariableType& type);
642
648 void set_variable_type(const string& variable_name, const VariableType& type);
649
655
660 void set_variable_names(const vector<string>& new_variable_names);
661
666 void set_variables_number(const Index new_size) { variables.resize(new_size); }
667
672 void set_variable_scalers(const string& scaler_name);
673
678 void set_variable_scalers(const vector<string>& scaler_names);
679
684
692 void set_feature_names(const vector<string>& new_feature_names);
693
698 void set_variable_roles(const string& role_name);
699
705 void set_shape(const string& role_name, const Shape& new_shape);
706
715 void set_data(const MatrixR& new_data);
716
721 void set_data_path(const filesystem::path& new_data_path) { data_path = new_data_path; }
722
727 void set_has_header(bool new_has_header) { has_header = new_has_header; }
728
733 void set_has_ids(bool new_has_ids) { has_sample_ids = new_has_ids; }
734
739 void set_separator(const Separator& new_separator) { separator = new_separator; }
740
745 void set_separator_string(const string& new_separator_string);
746
751 void set_separator_name(const string& new_separator_name);
752
757 void set_codification(const Codification& new_codification) { codification = new_codification; }
758
763 void set_codification(const string& new_codification);
764
769 void set_missing_values_label(string label) { missing_values_label = move(label); }
770
776
781 void set_missing_values_method(const string& method_name);
782
787 void set_gmt(const Index new_gmt) { gmt = new_gmt; }
788
793 void set_display(bool new_display) { display = new_display; }
794
800 bool is_sample_used(const Index i) const { return sample_roles[i] != SampleRole::None; }
801
807
813
819
824 bool has_time_variable() const;
825
830 bool has_validation() const;
831
837 bool has_missing_values(const vector<string>& labels) const;
838
850 void split_samples(const float training_ratio = 0.6f, float selection_ratio = 0.2f, float testing_ratio = 0.2f, bool shuffle = true);
851
858 void split_samples_sequential(const float training_ratio = 0.6f, float selection_ratio = 0.2f, float testing_ratio = 0.2f);
859
866 void split_samples_random(const float training_ratio = 0.6f, float selection_ratio = 0.2f, float testing_ratio = 0.2f);
867
873 vector<string> unuse_uncorrelated_variables(const float minimum_correlation = 0.25f);
874
880 vector<string> unuse_collinear_variables(const float maximum_correlation = 0.95f);
881
886 void set_data_constant(const float value);
887
892 vector<Descriptives> calculate_feature_descriptives() const;
893
899
905
911 vector<Descriptives> calculate_variable_descriptives_categories(const Index variable_index) const;
912
918 vector<Descriptives> calculate_feature_descriptives(const string& role_name) const;
919
925 vector<Histogram> calculate_variable_distributions(const Index bins_number = 10) const;
926
931 vector<BoxPlot> calculate_variables_box_plots() const;
932
941 Correlation (*correlation_function)(const MatrixR&, const MatrixR&), Correlation::Method method, const string& samples_role) const;
942
947 Tensor<Correlation, 2> calculate_input_variable_pearson_correlations() const;
948
954
962 Correlation (*correlation_function)(const MatrixR&, const MatrixR&), const string& samples_role) const;
963
969
975
981
986
991 vector<Descriptives> scale_data();
992
1004 virtual vector<Descriptives> scale_features(const string& role_name);
1005
1011 void unscale_features(const string& role_name, const vector<Descriptives>& feature_descriptives);
1012
1018
1025 vector<vector<Index>> calculate_Tukey_outliers(const float tukey_factor = 1.5f, bool replace = false);
1026
1032 vector<vector<Index>> replace_Tukey_outliers_with_NaN(const float tukey_factor = 1.5f);
1033
1038 void unuse_Tukey_outliers(const float tukey_factor = 1.5f);
1039
1043 virtual void set_data_random();
1044
1049 virtual void set_data_integer(const Index vocabulary_size);
1050
1055
1060
1065 virtual void from_JSON(const JsonDocument& document);
1066
1071 virtual void to_JSON(JsonWriter& writer) const;
1072
1077 void save(const filesystem::path& file_name) const;
1078
1083 void load(const filesystem::path& file_name);
1084
1088 void save_data() const;
1089
1094 void save_data_binary(const filesystem::path& file_name) const;
1095
1100
1106
1111 bool has_nan() const;
1112
1118 bool has_nan_row(const Index row_index) const;
1119
1124
1130
1135
1140
1145
1151
1157
1162 Index count_rows_with_nan() const;
1163
1168 Index count_nan() const;
1169
1176 vector<vector<Index>> split_samples(const vector<Index>& indices, Index parts_number) const;
1177
1181 virtual void read_csv();
1182
1191 DateFormat infer_dataset_date_format(const vector<Variable>& variables, const vector<vector<string>>& data_file_preview, bool has_header, const string& missing_values_label);
1192
1205 virtual void fill_inputs(const vector<Index>& sample_indices, const vector<Index>& feature_indices, float* buffer, bool transpose = true, int contiguous = -1) const;
1206
1215 virtual void augment_inputs(float* buffer, Index batch_size) const {}
1216
1225 virtual void fill_decoder(const vector<Index>& sample_indices,
1226 const vector<Index>& feature_indices,
1227 float* buffer,
1228 bool transpose = true,
1229 int contiguous = -1) const;
1230
1239 virtual void fill_targets(const vector<Index>& sample_indices, const vector<Index>& feature_indices, float* buffer, bool transpose = true, int contiguous = -1) const;
1240
1241private:
1242
1244 void infer_variable_types_from_data();
1245
1247 void unuse_constant_variables();
1248
1254 string get_sample_role(const Index i) const { return sample_role_to_string(sample_roles[i]); }
1255
1261 SampleRole get_sample_role_type(const Index i) const { return sample_roles[i]; }
1262
1269 vector<Index> filter_used_samples_by_column(Index, bool) const;
1270
1278 void apply_scaler(Index feature_index, const string& scaler, const Descriptives& descriptives, bool inverse);
1279
1284 void infer_column_types(const vector<vector<string>>& data_file_preview);
1285
1290 void read_data_file_preview(const vector<vector<string>>& data_file_preview);
1291
1297 void check_separators(const string& file_path) const;
1298
1299protected:
1300
1303
1306
1307 // DATA
1308
1311
1312 // Dimensions
1313
1320
1321 // Samples
1322
1324 vector<SampleRole> sample_roles;
1325
1327 vector<string> sample_ids;
1328
1329 // Variables
1330
1332 vector<Variable> variables;
1333
1334 // Data File
1335
1337 filesystem::path data_path;
1338
1341
1344
1345 //VectorB nans_variables;
1346
1348 bool has_header = false;
1349
1351 bool has_sample_ids = false;
1352
1355
1357 vector<vector<string>> data_file_preview;
1358
1360 Index gmt = 0;
1361
1362 // Missing Values
1363
1366
1369
1372
1375
1376 // Display
1377
1379 bool display = true;
1380
1382 const vector<string> positive_words = {"1", "yes", "positive", "+", "true", "good", "si", "sí", "Sí"};
1384 const vector<string> negative_words = {"0", "no", "negative", "-", "false", "bad", "not", "No"};
1385
1394
1403};
1404
1405}
1406
// Converts the token x to a string literal exactly as written (x is not macro-expanded).
1407#define STRINGIFY_ENUM(x) #x
1408
// Two-level form: x is macro-expanded first, then stringified through STRINGIFY_ENUM.
1409#define ENUM_TO_STRING(x) STRINGIFY_ENUM(x)
1410
1411// OpenNN: Open Neural Networks Library.
1412// Copyright(C) 2005-2026 Artificial Intelligence Techniques, SL.
1413// Licensed under the GNU Lesser General Public License v2.1 or later.
DateFormat infer_dataset_date_format(const vector< Variable > &variables, const vector< vector< string > > &data_file_preview, bool has_header, const string &missing_values_label)
Infers the date format used by date-typed variables in the source file.
virtual void read_csv()
Reads the configured CSV file into the data matrix.
MissingValuesMethod
Strategy for replacing missing values.
Definition dataset.h:154
@ Unuse
Definition dataset.h:154
@ Mean
Definition dataset.h:154
@ Interpolation
Definition dataset.h:154
@ Median
Definition dataset.h:154
virtual void fill_targets(const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const
Fills a contiguous buffer with target data for a batch of samples.
void set(const filesystem::path &file_name)
Resets the dataset by loading a previously serialized JSON state.
void split_samples_sequential(const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
Splits samples into partitions in their original order.
void set_variable_role(const Index variable_index, const string &role_name)
Sets the role of a single variable by index.
vector< string > get_feature_names() const
Returns the names of every feature.
Index count_nan() const
Counts the total number of NaN cells.
Shape get_shape(const string &role_name) const
Returns the input or target shape used by the network.
vector< Descriptives > scale_data()
Scales the entire data matrix.
Tensor< Correlation, 2 > calculate_input_variable_pearson_correlations() const
Computes Pearson correlations between every pair of input variables.
Tensor< Correlation, 2 > calculate_input_target_variable_correlations(Correlation(*correlation_function)(const MatrixR &, const MatrixR &), const string &samples_role) const
Computes a custom correlation between inputs and targets.
Index get_samples_number(const string &role_name) const
Returns the number of samples assigned to a given role.
void preview_data_to_JSON(JsonWriter &) const
Serializes the source-file preview to JSON.
void set_input_variables_unused()
Marks all input variables as Unused.
VectorI calculate_target_distribution() const
Counts the samples of every target class.
void set_variables_number(const Index new_size)
Resizes the variables vector.
Definition dataset.h:666
void set_variable_scalers(const string &scaler_name)
Sets the same scaler on every variable.
vector< Index > get_used_variables_indices() const
Returns the column indices of all variables that are not "Unused".
virtual vector< Descriptives > scale_features(const string &role_name)
Scales the features of a given role.
Tensor< Correlation, 2 > calculate_input_target_variable_spearman_correlations() const
Computes Spearman rank correlations between inputs and targets.
void split_samples_random(const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
Splits samples into partitions after random shuffling.
void set_separator(const Separator &new_separator)
Sets the field separator.
Definition dataset.h:739
vector< Index > get_sample_roles_vector() const
Returns the per-sample role indices as plain integers.
MatrixR get_data(const string &sample_role, const string &variable_role) const
Returns the data restricted to a sample-role and variable-role intersection.
Codification codification
Source-file character encoding.
Definition dataset.h:1354
void set_gmt(const Index new_gmt)
Sets the GMT offset for time variables.
Definition dataset.h:787
void set_display(bool new_display)
Toggles progress messages.
Definition dataset.h:793
string get_missing_values_method_string() const
Returns the missing-value strategy as a string.
bool has_header
Whether the source file's first row contains column names.
Definition dataset.h:1348
Index get_missing_values_number() const
Returns the total number of cells flagged as missing.
Definition dataset.h:1105
string get_separator_name() const
Returns the field separator as a human-readable name.
MatrixR data
Dense data matrix [samples x variables].
Definition dataset.h:1310
virtual void impute_missing_values_unuse()
Marks samples with missing values as Unused.
void set_variables(const string &description)
Re-creates the variables vector from an input/target shape descriptor.
vector< Index > get_variable_indices(const string &role_name) const
Returns the column indices of the variables assigned to a given role.
Shape target_shape
Shape of the target portion.
Definition dataset.h:1317
vector< vector< Index > > calculate_Tukey_outliers(const float tukey_factor=1.5f, bool replace=false)
Detects Tukey outliers per variable.
virtual void from_JSON(const JsonDocument &document)
Restores the dataset state from a JSON document.
void set_data(const MatrixR &new_data)
Replaces the data matrix.
void set_codification(const string &new_codification)
Sets the source-file codification from its name.
bool has_sample_ids
Whether the source file's first column contains sample identifiers.
Definition dataset.h:1351
vector< vector< Index > > split_samples(const vector< Index > &indices, Index parts_number) const
Splits a list of sample indices into chunks of (roughly) equal size.
void set_codification(const Codification &new_codification)
Sets the source-file codification.
Definition dataset.h:757
virtual void to_JSON(JsonWriter &writer) const
Serializes the dataset state to JSON.
bool is_empty() const
Reports whether the data matrix is empty.
Definition dataset.h:514
Tensor< Correlation, 2 > calculate_input_variable_correlations(Correlation(*correlation_function)(const MatrixR &, const MatrixR &), Correlation::Method method, const string &samples_role) const
Computes a custom correlation between every pair of input variables.
Shape get_target_shape() const
Returns the target shape.
Definition dataset.h:526
vector< Descriptives > calculate_variable_descriptives_categories(const Index variable_index) const
Computes descriptives per category of a categorical variable.
Shape decoder_shape
Shape of the decoder portion (transformer-style models).
Definition dataset.h:1319
void scrub_missing_values()
Removes samples that contain missing values.
const vector< SampleRole > & get_sample_roles() const
Returns the per-sample role assignments.
Definition dataset.h:192
void set_variable_type(const string &variable_name, const VariableType &type)
Sets the type of a single variable by name.
vector< Descriptives > calculate_variable_descriptives_positive_samples() const
Computes descriptives for inputs restricted to positive-target samples.
Dataset(const filesystem::path &data_path, const string &separator, bool has_header=true, bool has_ids=false, const Codification &codification=Codification::UTF8)
Constructs a dataset by loading a delimited text file.
void save_data_binary(const filesystem::path &file_name) const
Saves the data matrix as a binary file.
void set_missing_values_method(const string &method_name)
Sets the missing-value handling strategy from its name.
void unuse_Tukey_outliers(const float tukey_factor=1.5f)
Marks Tukey-outlier samples as Unused.
void impute_missing_values_statistic(const MissingValuesMethod &method)
Replaces missing values with a per-variable statistic.
const vector< string > negative_words
Strings interpreted as negative when parsing binary variables.
Definition dataset.h:1384
virtual void impute_missing_values_interpolate()
Replaces missing values via linear interpolation along each variable.
virtual void set_variable_roles(const vector< string > &role_names)
Assigns roles to all variables from a parallel string vector.
void set_sample_roles(const vector< Index > &sample_indices, const string &role_name)
Assigns the same role to a list of samples.
void set_default()
Resets configuration members to defaults.
vector< string > sample_ids
Optional per-sample identifiers (when the source file has an id column).
Definition dataset.h:1327
void set_separator_string(const string &new_separator_string)
Sets the field separator from its delimiter character(s).
string missing_values_label
Label that marks missing values in the source file.
Definition dataset.h:1343
vector< SampleRole > sample_roles
Per-sample role (Training/Validation/Testing/None).
Definition dataset.h:1324
MatrixR get_variable_data(const string &variable_name) const
Returns the data for a single variable identified by name.
void set_sample_roles(const string &role_name)
Assigns the same role to every sample.
void load_data_binary()
Loads the data matrix from a binary file produced by save_data_binary().
Index get_variables_number() const
Returns the total number of variables (columns of data).
Definition dataset.h:210
void save_data() const
Saves the data matrix back to the configured source file.
vector< string > get_variable_names() const
Returns the names of every variable.
void set_binary_variables()
Detects binary variables (two distinct values) and tags them accordingly.
void set_default_variable_roles_forecasting()
Sets default roles for forecasting (typical pattern: lagged inputs + future target).
vector< Descriptives > calculate_feature_descriptives(const string &role_name) const
Computes feature descriptives for a single role.
const MatrixR & get_data() const
Returns the raw data matrix.
Definition dataset.h:390
virtual void set_data_random()
Fills the data matrix with uniform random values.
void save(const filesystem::path &file_name) const
Saves the dataset state to a JSON file on disk.
vector< string > get_variable_names(const string &role_name) const
Returns the names of the variables assigned to a given role.
vector< string > get_feature_names(const string &role_name) const
Returns the names of the features assigned to a given role.
vector< vector< Index > > get_feature_indices() const
Returns the per-variable feature indices.
VectorI get_sample_role_numbers() const
Counts the samples assigned to each role.
Codification
Source-file character encoding.
Definition dataset.h:116
@ SHIFT_JIS
Definition dataset.h:116
@ UTF8
Definition dataset.h:116
Index get_used_samples_number() const
Returns the number of samples that are not "None".
vector< Index > get_feature_dimensions() const
Returns the per-variable feature dimension (1 for Numeric, N for Categorical).
bool has_binary_variables() const
Reports whether at least one variable is binary.
const string get_codification_string() const
Returns the codification as a string.
void set_data_path(const filesystem::path &new_data_path)
Sets the path to the source data file.
Definition dataset.h:721
void variables_from_JSON(const Json *)
Restores the variables vector from JSON.
virtual void set_data_integer(const Index vocabulary_size)
Fills the data matrix with random integers in [0, vocabulary_size).
vector< Histogram > calculate_variable_distributions(const Index bins_number=10) const
Builds histograms of every variable.
VectorI calculate_correlations_rank() const
Returns the rank of every input variable by absolute Pearson correlation against targets.
MissingValuesMethod get_missing_values_method() const
Returns the configured missing-value strategy.
Definition dataset.h:454
Index missing_values_number
Total number of missing cells.
Definition dataset.h:1368
void set_default_variable_roles()
Sets default Input/Target roles based on column position (last column = target).
vector< Variable > get_variables(const string &role_name) const
Returns the variables assigned to a given role.
const vector< vector< string > > & get_data_file_preview() const
Returns the cached preview of the source file (first rows).
Definition dataset.h:448
void set_has_ids(bool new_has_ids)
Sets whether the source file has a sample-id column.
Definition dataset.h:733
Index count_variables_with_nan() const
Counts variables that contain at least one NaN.
Index count_rows_with_nan() const
Counts samples that contain at least one NaN.
Separator separator
Field separator used in the source file.
Definition dataset.h:1340
Tensor< Correlation, 2 > calculate_input_variable_spearman_correlations() const
Computes Spearman rank correlations between every pair of input variables.
void samples_from_JSON(const Json *)
Restores the per-sample roles from JSON.
Shape get_input_shape() const
Returns the input shape.
Definition dataset.h:520
void missing_values_from_JSON(const Json *)
Restores the missing-value statistics from JSON.
vector< Variable > variables
Per-variable metadata (name, role, type, scaler).
Definition dataset.h:1332
void set(const Index samples_number=0, const Shape &input_shape={}, const Shape &target_shape={})
Resets the dataset to a synthetic shape.
vector< Index > get_feature_indices(const string &role_name) const
Returns the feature indices for variables of a given role.
virtual void get_batches(const vector< Index > &sample_indices, Index batch_size, bool shuffle, vector< vector< Index > > &batches) const
Splits a list of sample indices into batches.
bool has_binary_or_categorical_variables() const
Reports whether the dataset has any binary or categorical variable.
void set_default_variable_names()
Sets default names ("variable_1", "variable_2", ...) for every variable.
vector< Index > get_used_feature_indices() const
Returns the feature indices for all variables that are not "Unused".
void set(const filesystem::path &data_path, const string &separator, bool has_header=true, bool has_ids=false, const Dataset::Codification &codification=Codification::UTF8)
Resets the dataset by loading a delimited text file.
vector< string > unuse_collinear_variables(const float maximum_correlation=0.95f)
Marks variables strongly correlated against another input as Unused.
void set_variable_names(const vector< string > &new_variable_names)
Replaces the names of every variable.
Index get_variable_index(const string &name) const
Returns the column index of the variable with a given name.
void set_missing_values_label(string label)
Sets the label used for missing values in the source file.
Definition dataset.h:769
VectorR get_sample_data(const Index sample_index) const
Returns a single sample as a row vector.
vector< Descriptives > calculate_variable_descriptives_negative_samples() const
Computes descriptives for inputs restricted to negative-target samples.
Dataset(const Index samples_number=0, const Shape &input_shape={0}, const Shape &target_shape={0})
Constructs an empty dataset of given dimensions.
vector< vector< string > > data_file_preview
Cached preview of the first rows of the source file.
Definition dataset.h:1357
Index rows_missing_values_number
Number of rows that contain at least one missing value.
Definition dataset.h:1374
Index get_samples_number() const
Returns the total number of samples (rows of data).
Definition dataset.h:160
MissingValuesMethod missing_values_method
Strategy used to handle missing values.
Definition dataset.h:1365
vector< Index > get_feature_indices(const Index variable_index) const
Returns the feature indices for a single variable.
bool has_categorical_variables() const
Reports whether at least one variable is categorical.
void set_variable_indices(const vector< Index > &input_indices, const vector< Index > &target_indices)
Marks the variables listed in input_indices as Input and those listed in target_indices as Target.
MatrixR get_data_from_indices(const vector< Index > &sample_indices, const vector< Index > &variable_indices) const
Returns the data restricted to specific samples and variables.
bool has_validation() const
Reports whether at least one sample is assigned to Validation.
const vector< string > positive_words
Strings interpreted as positive when parsing binary variables.
Definition dataset.h:1382
bool has_missing_values(const vector< string > &labels) const
Reports whether the dataset has missing values matching any of the supplied labels.
void set_variable_roles(const string &role_name)
Assigns the same role to every variable.
Tensor< Correlation, 2 > calculate_input_target_variable_pearson_correlations() const
Computes Pearson correlations between inputs and targets.
void set_missing_values_method(const MissingValuesMethod &method)
Sets the missing-value handling strategy.
Definition dataset.h:775
const Separator & get_separator() const
Returns the configured field separator.
Definition dataset.h:472
bool has_nan_row(const Index row_index) const
Reports whether a row contains any NaN.
vector< vector< Index > > replace_Tukey_outliers_with_NaN(const float tukey_factor=1.5f)
Detects Tukey outliers and replaces them with NaN.
void load(const filesystem::path &file_name)
Loads the dataset state from a JSON file on disk.
MatrixR get_variable_data(const Index variable_index, const vector< Index > &sample_indices) const
Returns the data for a single variable on a subset of samples.
Shape input_shape
Shape of the input portion (rank may exceed 1 for image/sequence data).
Definition dataset.h:1315
Index gmt
GMT offset for time variables, in hours.
Definition dataset.h:1360
const filesystem::path & get_data_path() const
Returns the path to the source data file.
Definition dataset.h:466
vector< Descriptives > calculate_feature_descriptives() const
Computes descriptive statistics for every feature.
void set_variable_role(const string &variable_name, const string &role_name)
Sets the role of a single variable by name.
void set_variables(const vector< Variable > &new_variables)
Replaces the per-variable metadata.
Definition dataset.h:591
vector< Index > get_used_sample_indices() const
Returns the indices of all samples that are not "None".
Index get_used_variables_number() const
Returns the number of variables that are not "Unused".
void set_shape(const string &role_name, const Shape &new_shape)
Sets the input or target shape.
void set_data_rosenbrock()
Fills the data matrix with samples from the Rosenbrock function.
bool has_time_variable() const
Reports whether at least one variable plays the Time role.
virtual void augment_inputs(float *buffer, Index batch_size) const
Optionally augments inputs in-place after fill_inputs() (e.g. random crops).
Definition dataset.h:1215
void samples_to_JSON(JsonWriter &) const
Serializes the per-sample roles to JSON.
virtual void fill_decoder(const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const
Fills a contiguous buffer with decoder-side inputs (transformer-style models).
vector< BoxPlot > calculate_variables_box_plots() const
Computes box-plot statistics for every variable.
bool has_nan() const
Reports whether the data matrix contains any NaN.
void set_sample_role(const Index sample_index, const string &role_name)
Assigns a role to a single sample.
void set_variable_type(const Index variable_index, const VariableType &type)
Sets the type of a single variable by index.
vector< string > unuse_uncorrelated_variables(const float minimum_correlation=0.25f)
Marks variables with low correlation against the target as Unused.
MatrixR get_feature_data(const string &role_name) const
Returns the data matrix restricted to the features of a given role.
VectorI count_nans_per_variable() const
Counts NaN cells per variable.
void set_data_binary_classification()
Fills the data matrix with synthetic binary-classification data.
void set_separator_name(const string &new_separator_name)
Sets the field separator from its human-readable name.
void set_feature_names(const vector< string > &new_feature_names)
Sets the name of every feature.
const vector< Variable > & get_variables() const
Returns the per-variable metadata.
Definition dataset.h:229
bool get_display() const
Reports whether progress messages are printed.
Definition dataset.h:508
filesystem::path data_path
Path to the source data file.
Definition dataset.h:1337
Separator
Field-separator type for tabular files.
Definition dataset.h:148
@ Comma
Definition dataset.h:148
@ Tab
Definition dataset.h:148
@ Semicolon
Definition dataset.h:148
@ Space
Definition dataset.h:148
const Codification & get_codification() const
Returns the configured source-file codification.
Definition dataset.h:490
string get_separator_string() const
Returns the field separator as the actual delimiter character(s).
void set_variable_scalers(const vector< string > &scaler_names)
Sets one scaler per variable.
void split_samples(const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true)
Splits samples into training/validation/testing partitions.
vector< VariableType > get_variable_types(const vector< Index > indices) const
Returns the types of a list of variables.
VariableType get_variable_type(const Index index) const
Returns the type of a variable (Numeric, Binary, Categorical, ...).
Definition dataset.h:283
vector< Index > get_sample_indices(const string &role_name) const
Returns the indices of the samples assigned to a given role.
bool is_sample_used(const Index i) const
Reports whether a sample is used (any role other than None).
Definition dataset.h:800
VectorI variables_missing_values_number
Per-variable missing-value count.
Definition dataset.h:1371
void calculate_missing_values_statistics()
Updates the cached missing-value statistics (counts, indices).
void preview_data_from_JSON(const Json *)
Restores the source-file preview from JSON.
Index get_used_features_number() const
Returns the number of features that are not "Unused".
bool display
Whether to print progress messages.
Definition dataset.h:1379
Index get_variable_index(const Index id) const
Returns the column index of the variable with a given numeric id.
void set_default_variable_scalers()
Picks default scalers for every variable based on its type.
const string & get_missing_values_label() const
Returns the label that marks missing values in the source file.
Definition dataset.h:502
virtual void fill_inputs(const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const
Fills a contiguous float buffer with input data for a batch of samples.
void set_has_header(bool new_has_header)
Sets whether the source file has a header row.
Definition dataset.h:727
void unscale_features(const string &role_name, const vector< Descriptives > &feature_descriptives)
Inverse-scales the features of a given role.
Index get_variables_number(const string &role_name) const
Returns the number of variables assigned to a given role.
Index get_features_number(const string &role_name) const
Returns the number of features assigned to a given role.
MatrixR get_variable_data(const Index variable_index) const
Returns the data for a single variable across all samples.
void variables_to_JSON(JsonWriter &) const
Serializes the variables vector to JSON.
void set_sample_roles(const vector< string > &role_names)
Assigns roles to all samples from a parallel string vector.
Index get_features_number() const
Returns the total number of features.
void set_variable_types(const VariableType &type)
Sets every variable to a given type.
vector< string > get_feature_scalers(const string &role_name) const
Returns the scaler chosen for each feature of a given role.
void missing_values_to_JSON(JsonWriter &) const
Serializes the missing-value statistics to JSON.
void set_data_constant(const float value)
Fills the data matrix with a constant value.
Definition json.h:71
Definition json.h:84
Definition json.h:22
Definition adaptive_moment_estimation.h:19
Matrix< float, Dynamic, Dynamic, Layout > MatrixR
Definition neural_network.h:152
Matrix< Index, Dynamic, 1 > VectorI
Definition neural_network.h:157
Matrix< float, Dynamic, 1 > VectorR
Definition neural_network.h:156
const string & sample_role_to_string(SampleRole role)
Converts a SampleRole to its canonical string name.
Definition dataset.h:68
vector< Descriptives > descriptives(const MatrixR &)
const EnumMap< SampleRole > & sample_role_map()
Returns the string<->enum mapping for SampleRole values.
Definition dataset.h:51
SampleRole string_to_sample_role(const string &name)
Parses a SampleRole from string.
Definition dataset.h:81
SampleRole
Role of a sample within a dataset partition.
Definition dataset.h:40
@ Validation
Definition dataset.h:42
@ None
Definition dataset.h:44
@ Training
Definition dataset.h:41
@ Testing
Definition dataset.h:43
VariableType
Definition variable.h:18
DateFormat
Definition string_utilities.h:32
void shuffle(VectorB &vector_to_shuffle)
void replace(string &, const string &, const string &)
Definition correlations.h:17
Method
Definition correlations.h:18
Definition enum_map.h:18
const string & to_string(Enum value) const
Definition enum_map.h:23
Definition tensor_utilities.h:46