53 static const vector<pair<SampleRole, string>> entries = {
139 bool has_ids =
false,
384 virtual void get_batches(
const vector<Index>& sample_indices, Index batch_size,
bool shuffle, vector<vector<Index>>& batches)
const;
547 bool has_ids =
false,
554 void set(
const filesystem::path& file_name);
850 void split_samples(
const float training_ratio = 0.6f,
float selection_ratio = 0.2f,
float testing_ratio = 0.2f,
bool shuffle =
true);
866 void split_samples_random(
const float training_ratio = 0.6f,
float selection_ratio = 0.2f,
float testing_ratio = 0.2f);
1011 void unscale_features(
const string& role_name,
const vector<Descriptives>& feature_descriptives);
1077 void save(
const filesystem::path& file_name)
const;
1083 void load(
const filesystem::path& file_name);
1176 vector<vector<Index>>
split_samples(
const vector<Index>& indices, Index parts_number)
const;
1205 virtual void fill_inputs(
const vector<Index>& sample_indices,
const vector<Index>& feature_indices,
float* buffer,
bool transpose =
true,
int contiguous = -1)
const;
1226 const vector<Index>& feature_indices,
1228 bool transpose =
true,
1229 int contiguous = -1)
const;
1239 virtual void fill_targets(
const vector<Index>& sample_indices,
const vector<Index>& feature_indices,
float* buffer,
bool transpose =
true,
int contiguous = -1)
const;
1244 void infer_variable_types_from_data();
1247 void unuse_constant_variables();
1269 vector<Index> filter_used_samples_by_column(Index,
bool)
const;
1278 void apply_scaler(Index feature_index,
const string& scaler,
const Descriptives&
descriptives,
bool inverse);
1297 void check_separators(
const string& file_path)
const;
1382 const vector<string>
positive_words = {
"1",
"yes",
"positive",
"+",
"true",
"good",
"si",
"sí",
"Sí"};
1384 const vector<string>
negative_words = {
"0",
"no",
"negative",
"-",
"false",
"bad",
"not",
"No"};
1407 #define STRINGIFY_ENUM(x) #x
1409 #define ENUM_TO_STRING(x) STRINGIFY_ENUM(x)
DateFormat infer_dataset_date_format(const vector< Variable > &variables, const vector< vector< string > > &data_file_preview, bool has_header, const string &missing_values_label)
Infers the date format used by date-typed variables in the source file.
virtual void read_csv()
Reads the configured CSV file into the data matrix.
MissingValuesMethod
Strategy for replacing missing values.
Definition dataset.h:154
@ Unuse
Definition dataset.h:154
@ Mean
Definition dataset.h:154
@ Interpolation
Definition dataset.h:154
@ Median
Definition dataset.h:154
virtual void fill_targets(const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const
Fills a contiguous buffer with target data for a batch of samples.
void set(const filesystem::path &file_name)
Resets the dataset by loading a previously serialized JSON state.
void split_samples_sequential(const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
Splits samples into partitions in their original order.
void set_variable_role(const Index variable_index, const string &role_name)
Sets the role of a single variable by index.
vector< string > get_feature_names() const
Returns the names of every feature.
Index count_nan() const
Counts the total number of NaN cells.
Shape get_shape(const string &role_name) const
Returns the input or target shape used by the network.
vector< Descriptives > scale_data()
Scales the entire data matrix.
Tensor< Correlation, 2 > calculate_input_variable_pearson_correlations() const
Computes Pearson correlations between every pair of input variables.
Tensor< Correlation, 2 > calculate_input_target_variable_correlations(Correlation(*correlation_function)(const MatrixR &, const MatrixR &), const string &samples_role) const
Computes a custom correlation between inputs and targets.
Index get_samples_number(const string &role_name) const
Returns the number of samples assigned to a given role.
void preview_data_to_JSON(JsonWriter &) const
Serializes the source-file preview to JSON.
void set_input_variables_unused()
Marks all input variables as Unused.
VectorI calculate_target_distribution() const
Counts the samples of every target class.
void set_variables_number(const Index new_size)
Resizes the variables vector.
Definition dataset.h:666
void set_variable_scalers(const string &scaler_name)
Sets the same scaler on every variable.
vector< Index > get_used_variables_indices() const
Returns the column indices of all variables that are not "Unused".
virtual vector< Descriptives > scale_features(const string &role_name)
Scales the features of a given role.
Tensor< Correlation, 2 > calculate_input_target_variable_spearman_correlations() const
Computes Spearman rank correlations between inputs and targets.
void split_samples_random(const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
Splits samples into partitions after random shuffling.
void set_separator(const Separator &new_separator)
Sets the field separator.
Definition dataset.h:739
vector< Index > get_sample_roles_vector() const
Returns the per-sample role indices as plain integers.
MatrixR get_data(const string &sample_role, const string &variable_role) const
Returns the data restricted to a sample-role and variable-role intersection.
Codification codification
Source-file character encoding.
Definition dataset.h:1354
void set_gmt(const Index new_gmt)
Sets the GMT offset for time variables.
Definition dataset.h:787
void set_display(bool new_display)
Toggles progress messages.
Definition dataset.h:793
string get_missing_values_method_string() const
Returns the missing-value strategy as a string.
bool has_header
Whether the source file's first row contains column names.
Definition dataset.h:1348
Index get_missing_values_number() const
Returns the total number of cells flagged as missing.
Definition dataset.h:1105
string get_separator_name() const
Returns the field separator as a human-readable name.
MatrixR data
Dense data matrix [samples x variables].
Definition dataset.h:1310
virtual void impute_missing_values_unuse()
Marks samples with missing values as Unused.
void set_variables(const string &description)
Re-creates the variables vector from an input/target shape descriptor.
vector< Index > get_variable_indices(const string &role_name) const
Returns the column indices of the variables assigned to a given role.
Shape target_shape
Shape of the target portion.
Definition dataset.h:1317
vector< vector< Index > > calculate_Tukey_outliers(const float tukey_factor=1.5f, bool replace=false)
Detects Tukey outliers per variable.
virtual void from_JSON(const JsonDocument &document)
Restores the dataset state from a JSON document.
void set_data(const MatrixR &new_data)
Replaces the data matrix.
void set_codification(const string &new_codification)
Sets the source-file codification from its name.
bool has_sample_ids
Whether the source file's first column contains sample identifiers.
Definition dataset.h:1351
vector< vector< Index > > split_samples(const vector< Index > &indices, Index parts_number) const
Splits a list of sample indices into chunks of (roughly) equal size.
void set_codification(const Codification &new_codification)
Sets the source-file codification.
Definition dataset.h:757
virtual void to_JSON(JsonWriter &writer) const
Serializes the dataset state to JSON.
bool is_empty() const
Reports whether the data matrix is empty.
Definition dataset.h:514
Tensor< Correlation, 2 > calculate_input_variable_correlations(Correlation(*correlation_function)(const MatrixR &, const MatrixR &), Correlation::Method method, const string &samples_role) const
Computes a custom correlation between every pair of input variables.
Shape get_target_shape() const
Returns the target shape.
Definition dataset.h:526
vector< Descriptives > calculate_variable_descriptives_categories(const Index variable_index) const
Computes descriptives per category of a categorical variable.
Shape decoder_shape
Shape of the decoder portion (transformer-style models).
Definition dataset.h:1319
void scrub_missing_values()
Removes samples that contain missing values.
const vector< SampleRole > & get_sample_roles() const
Returns the per-sample role assignments.
Definition dataset.h:192
void set_variable_type(const string &variable_name, const VariableType &type)
Sets the type of a single variable by name.
vector< Descriptives > calculate_variable_descriptives_positive_samples() const
Computes descriptives for inputs restricted to positive-target samples.
Dataset(const filesystem::path &data_path, const string &separator, bool has_header=true, bool has_ids=false, const Codification &codification=Codification::UTF8)
Constructs a dataset by loading a delimited text file.
void save_data_binary(const filesystem::path &file_name) const
Saves the data matrix as a binary file.
void set_missing_values_method(const string &method_name)
Sets the missing-value handling strategy from its name.
void unuse_Tukey_outliers(const float tukey_factor=1.5f)
Marks Tukey-outlier samples as Unused.
void impute_missing_values_statistic(const MissingValuesMethod &method)
Replaces missing values with a per-variable statistic.
const vector< string > negative_words
Strings interpreted as negative when parsing binary variables.
Definition dataset.h:1384
virtual void impute_missing_values_interpolate()
Replaces missing values via linear interpolation along each variable.
virtual void set_variable_roles(const vector< string > &role_names)
Assigns roles to all variables from a parallel string vector.
void set_sample_roles(const vector< Index > &sample_indices, const string &role_name)
Assigns the same role to a list of samples.
void set_default()
Resets configuration members to defaults.
vector< string > sample_ids
Optional per-sample identifiers (when the source file has an id column).
Definition dataset.h:1327
void set_separator_string(const string &new_separator_string)
Sets the field separator from its delimiter character(s).
string missing_values_label
Label that marks missing values in the source file.
Definition dataset.h:1343
vector< SampleRole > sample_roles
Per-sample role (Training/Validation/Testing/None).
Definition dataset.h:1324
MatrixR get_variable_data(const string &variable_name) const
Returns the data for a single variable identified by name.
void set_sample_roles(const string &role_name)
Assigns the same role to every sample.
void load_data_binary()
Loads the data matrix from a binary file produced by save_data_binary().
Index get_variables_number() const
Returns the total number of variables (columns of data).
Definition dataset.h:210
void save_data() const
Saves the data matrix back to the configured source file.
vector< string > get_variable_names() const
Returns the names of every variable.
void set_binary_variables()
Detects binary variables (two distinct values) and tags them accordingly.
void set_default_variable_roles_forecasting()
Sets default roles for forecasting (typical pattern: lagged inputs + future target).
vector< Descriptives > calculate_feature_descriptives(const string &role_name) const
Computes feature descriptives for a single role.
const MatrixR & get_data() const
Returns the raw data matrix.
Definition dataset.h:390
virtual void set_data_random()
Fills the data matrix with uniform random values.
void save(const filesystem::path &file_name) const
Saves the dataset state to a JSON file on disk.
vector< string > get_variable_names(const string &role_name) const
Returns the names of the variables assigned to a given role.
vector< string > get_feature_names(const string &role_name) const
Returns the names of the features assigned to a given role.
vector< vector< Index > > get_feature_indices() const
Returns the per-variable feature indices.
VectorI get_sample_role_numbers() const
Counts the samples assigned to each role.
Codification
Source-file character encoding.
Definition dataset.h:116
@ SHIFT_JIS
Definition dataset.h:116
@ UTF8
Definition dataset.h:116
Index get_used_samples_number() const
Returns the number of samples that are not "None".
vector< Index > get_feature_dimensions() const
Returns the per-variable feature dimension (1 for Numeric, N for Categorical).
bool has_binary_variables() const
Reports whether at least one variable is binary.
const string get_codification_string() const
Returns the codification as a string.
void set_data_path(const filesystem::path &new_data_path)
Sets the path to the source data file.
Definition dataset.h:721
void variables_from_JSON(const Json *)
Restores the variables vector from JSON.
virtual void set_data_integer(const Index vocabulary_size)
Fills the data matrix with random integers in [0, vocabulary_size).
vector< Histogram > calculate_variable_distributions(const Index bins_number=10) const
Builds histograms of every variable.
VectorI calculate_correlations_rank() const
Returns the rank of every input variable by absolute Pearson correlation against targets.
MissingValuesMethod get_missing_values_method() const
Returns the configured missing-value strategy.
Definition dataset.h:454
Index missing_values_number
Total number of missing cells.
Definition dataset.h:1368
void set_default_variable_roles()
Sets default Input/Target roles based on column position (last column = target).
vector< Variable > get_variables(const string &role_name) const
Returns the variables assigned to a given role.
const vector< vector< string > > & get_data_file_preview() const
Returns the cached preview of the source file (first rows).
Definition dataset.h:448
void set_has_ids(bool new_has_ids)
Sets whether the source file has a sample-id column.
Definition dataset.h:733
Index count_variables_with_nan() const
Counts variables that contain at least one NaN.
Index count_rows_with_nan() const
Counts samples that contain at least one NaN.
Separator separator
Field separator used in the source file.
Definition dataset.h:1340
Tensor< Correlation, 2 > calculate_input_variable_spearman_correlations() const
Computes Spearman rank correlations between every pair of input variables.
void samples_from_JSON(const Json *)
Restores the per-sample roles from JSON.
Shape get_input_shape() const
Returns the input shape.
Definition dataset.h:520
void missing_values_from_JSON(const Json *)
Restores the missing-value statistics from JSON.
vector< Variable > variables
Per-variable metadata (name, role, type, scaler).
Definition dataset.h:1332
void set(const Index samples_number=0, const Shape &input_shape={}, const Shape &target_shape={})
Resets the dataset to a synthetic shape.
vector< Index > get_feature_indices(const string &role_name) const
Returns the feature indices for variables of a given role.
virtual void get_batches(const vector< Index > &sample_indices, Index batch_size, bool shuffle, vector< vector< Index > > &batches) const
Splits a list of sample indices into batches.
bool has_binary_or_categorical_variables() const
Reports whether the dataset has any binary or categorical variable.
void set_default_variable_names()
Sets default names ("variable_1", "variable_2", ...) for every variable.
vector< Index > get_used_feature_indices() const
Returns the feature indices for all variables that are not "Unused".
void set(const filesystem::path &data_path, const string &separator, bool has_header=true, bool has_ids=false, const Dataset::Codification &codification=Codification::UTF8)
Resets the dataset by loading a delimited text file.
vector< string > unuse_collinear_variables(const float maximum_correlation=0.95f)
Marks variables strongly correlated with another input as Unused.
void set_variable_names(const vector< string > &new_variable_names)
Replaces the names of every variable.
Index get_variable_index(const string &name) const
Returns the column index of the variable with a given name.
void set_missing_values_label(string label)
Sets the label used for missing values in the source file.
Definition dataset.h:769
VectorR get_sample_data(const Index sample_index) const
Returns a single sample as a row vector.
vector< Descriptives > calculate_variable_descriptives_negative_samples() const
Computes descriptives for inputs restricted to negative-target samples.
Dataset(const Index samples_number=0, const Shape &input_shape={0}, const Shape &target_shape={0})
Constructs an empty dataset of given dimensions.
vector< vector< string > > data_file_preview
Cached preview of the first rows of the source file.
Definition dataset.h:1357
Index rows_missing_values_number
Number of rows that contain at least one missing value.
Definition dataset.h:1374
Index get_samples_number() const
Returns the total number of samples (rows of data).
Definition dataset.h:160
MissingValuesMethod missing_values_method
Strategy used to handle missing values.
Definition dataset.h:1365
vector< Index > get_feature_indices(const Index variable_index) const
Returns the feature indices for a single variable.
bool has_categorical_variables() const
Reports whether at least one variable is categorical.
void set_variable_indices(const vector< Index > &input_indices, const vector< Index > &target_indices)
Marks selected variables as Input and others as Target.
MatrixR get_data_from_indices(const vector< Index > &sample_indices, const vector< Index > &variable_indices) const
Returns the data restricted to specific samples and variables.
bool has_validation() const
Reports whether at least one sample is assigned to Validation.
const vector< string > positive_words
Strings interpreted as positive when parsing binary variables.
Definition dataset.h:1382
bool has_missing_values(const vector< string > &labels) const
Reports whether the dataset has missing values matching any of the supplied labels.
void set_variable_roles(const string &role_name)
Assigns the same role to every variable.
Tensor< Correlation, 2 > calculate_input_target_variable_pearson_correlations() const
Computes Pearson correlations between inputs and targets.
void set_missing_values_method(const MissingValuesMethod &method)
Sets the missing-value handling strategy.
Definition dataset.h:775
const Separator & get_separator() const
Returns the configured field separator.
Definition dataset.h:472
bool has_nan_row(const Index row_index) const
Reports whether a row contains any NaN.
vector< vector< Index > > replace_Tukey_outliers_with_NaN(const float tukey_factor=1.5f)
Detects Tukey outliers and replaces them with NaN.
void load(const filesystem::path &file_name)
Loads the dataset state from a JSON file on disk.
MatrixR get_variable_data(const Index variable_index, const vector< Index > &sample_indices) const
Returns the data for a single variable on a subset of samples.
Shape input_shape
Shape of the input portion (rank may exceed 1 for image/sequence data).
Definition dataset.h:1315
Index gmt
GMT offset for time variables, in hours.
Definition dataset.h:1360
const filesystem::path & get_data_path() const
Returns the path to the source data file.
Definition dataset.h:466
vector< Descriptives > calculate_feature_descriptives() const
Computes descriptive statistics for every feature.
void set_variable_role(const string &variable_name, const string &role_name)
Sets the role of a single variable by name.
void set_variables(const vector< Variable > &new_variables)
Replaces the per-variable metadata.
Definition dataset.h:591
vector< Index > get_used_sample_indices() const
Returns the indices of all samples that are not "None".
Index get_used_variables_number() const
Returns the number of variables that are not "Unused".
void set_shape(const string &role_name, const Shape &new_shape)
Sets the input or target shape.
void set_data_rosenbrock()
Fills the data matrix with samples from the Rosenbrock function.
bool has_time_variable() const
Reports whether at least one variable plays the Time role.
virtual void augment_inputs(float *buffer, Index batch_size) const
Optionally augments inputs in-place after fill_inputs() (e.g. random crops).
Definition dataset.h:1215
void samples_to_JSON(JsonWriter &) const
Serializes the per-sample roles to JSON.
virtual void fill_decoder(const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const
Fills a contiguous buffer with decoder-side inputs (transformer-style models).
vector< BoxPlot > calculate_variables_box_plots() const
Computes box-plot statistics for every variable.
bool has_nan() const
Reports whether the data matrix contains any NaN.
void set_sample_role(const Index sample_index, const string &role_name)
Assigns a role to a single sample.
void set_variable_type(const Index variable_index, const VariableType &type)
Sets the type of a single variable by index.
vector< string > unuse_uncorrelated_variables(const float minimum_correlation=0.25f)
Marks variables with low correlation with the target as Unused.
MatrixR get_feature_data(const string &role_name) const
Returns the data matrix restricted to the features of a given role.
VectorI count_nans_per_variable() const
Counts NaN cells per variable.
void set_data_binary_classification()
Fills the data matrix with synthetic binary-classification data.
void set_separator_name(const string &new_separator_name)
Sets the field separator from its human-readable name.
void set_feature_names(const vector< string > &new_feature_names)
Names every feature.
const vector< Variable > & get_variables() const
Returns the per-variable metadata.
Definition dataset.h:229
bool get_display() const
Reports whether progress messages are printed.
Definition dataset.h:508
filesystem::path data_path
Path to the source data file.
Definition dataset.h:1337
Separator
Field-separator type for tabular files.
Definition dataset.h:148
@ Comma
Definition dataset.h:148
@ Tab
Definition dataset.h:148
@ Semicolon
Definition dataset.h:148
@ Space
Definition dataset.h:148
const Codification & get_codification() const
Returns the configured source-file codification.
Definition dataset.h:490
string get_separator_string() const
Returns the field separator as the actual delimiter character(s).
void set_variable_scalers(const vector< string > &scaler_names)
Sets one scaler per variable.
void split_samples(const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true)
Splits samples into training/validation/testing partitions.
vector< VariableType > get_variable_types(const vector< Index > indices) const
Returns the types of a list of variables.
VariableType get_variable_type(const Index index) const
Returns the type of a variable (Numeric, Binary, Categorical, ...).
Definition dataset.h:283
vector< Index > get_sample_indices(const string &role_name) const
Returns the indices of the samples assigned to a given role.
bool is_sample_used(const Index i) const
Reports whether a sample is used (any role other than None).
Definition dataset.h:800
VectorI variables_missing_values_number
Per-variable missing-value count.
Definition dataset.h:1371
void calculate_missing_values_statistics()
Updates the cached missing-value statistics (counts, indices).
void preview_data_from_JSON(const Json *)
Restores the source-file preview from JSON.
Index get_used_features_number() const
Returns the number of features that are not "Unused".
bool display
Whether to print progress messages.
Definition dataset.h:1379
Index get_variable_index(const Index id) const
Returns the column index of the variable with a given numeric id.
void set_default_variable_scalers()
Picks default scalers for every variable based on its type.
const string & get_missing_values_label() const
Returns the label that marks missing values in the source file.
Definition dataset.h:502
virtual void fill_inputs(const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const
Fills a contiguous float buffer with input data for a batch of samples.
void set_has_header(bool new_has_header)
Sets whether the source file has a header row.
Definition dataset.h:727
void unscale_features(const string &role_name, const vector< Descriptives > &feature_descriptives)
Inverse-scales the features of a given role.
Index get_variables_number(const string &role_name) const
Returns the number of variables assigned to a given role.
Index get_features_number(const string &role_name) const
Returns the number of features assigned to a given role.
MatrixR get_variable_data(const Index variable_index) const
Returns the data for a single variable across all samples.
void variables_to_JSON(JsonWriter &) const
Serializes the variables vector to JSON.
void set_sample_roles(const vector< string > &role_names)
Assigns roles to all samples from a parallel string vector.
Index get_features_number() const
Returns the total number of features.
void set_variable_types(const VariableType &type)
Sets every variable to a given type.
vector< string > get_feature_scalers(const string &role_name) const
Returns the scaler chosen for each variable of a given role.
void missing_values_to_JSON(JsonWriter &) const
Serializes the missing-value statistics to JSON.
void set_data_constant(const float value)
Fills the data matrix with a constant value.
Definition adaptive_moment_estimation.h:19
Matrix< float, Dynamic, Dynamic, Layout > MatrixR
Definition neural_network.h:152
Matrix< Index, Dynamic, 1 > VectorI
Definition neural_network.h:157
Matrix< float, Dynamic, 1 > VectorR
Definition neural_network.h:156
const string & sample_role_to_string(SampleRole role)
Converts a SampleRole to its canonical string name.
Definition dataset.h:68
vector< Descriptives > descriptives(const MatrixR &)
const EnumMap< SampleRole > & sample_role_map()
Returns the string<->enum mapping for SampleRole values.
Definition dataset.h:51
SampleRole string_to_sample_role(const string &name)
Parses a SampleRole from string.
Definition dataset.h:81
SampleRole
Role of a sample within a dataset partition.
Definition dataset.h:40
@ Validation
Definition dataset.h:42
@ None
Definition dataset.h:44
@ Training
Definition dataset.h:41
@ Testing
Definition dataset.h:43
VariableType
Definition variable.h:18
DateFormat
Definition string_utilities.h:32
void shuffle(VectorB &vector_to_shuffle)
void replace(string &, const string &, const string &)
Definition correlations.h:17
Method
Definition correlations.h:18
const string & to_string(Enum value) const
Definition enum_map.h:23
Definition tensor_utilities.h:46