OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
opennn::Dataset Class Reference

Base data container with samples, variables and per-variable metadata. More...

#include <dataset.h>

Inheritance diagram for opennn::Dataset:
[legend]

Public Types

enum class  Codification { UTF8 , SHIFT_JIS }
 Source-file character encoding. More...
 
enum class  Separator { Space , Tab , Comma , Semicolon }
 Field-separator type for tabular files. More...
 
enum class  MissingValuesMethod { Unuse , Mean , Median , Interpolation }
 Strategy for replacing missing values. More...
 

Public Member Functions

 Dataset (const Index samples_number=0, const Shape &input_shape={0}, const Shape &target_shape={0})
 Constructs an empty dataset of given dimensions.
 
 Dataset (const filesystem::path &data_path, const string &separator, bool has_header=true, bool has_ids=false, const Codification &codification=Codification::UTF8)
 Constructs a dataset by loading a delimited text file.
 
Index get_samples_number () const
 Returns the total number of samples (rows of data).
 
Index get_samples_number (const string &role_name) const
 Returns the number of samples assigned to a given role.
 
Index get_used_samples_number () const
 Returns the number of samples that are not "None".
 
vector< Index > get_sample_indices (const string &role_name) const
 Returns the indices of the samples assigned to a given role.
 
vector< Index > get_used_sample_indices () const
 Returns the indices of all samples that are not "None".
 
const vector< SampleRole > & get_sample_roles () const
 Returns the per-sample role assignments.
 
vector< Index > get_sample_roles_vector () const
 Returns the per-sample role indices as plain integers.
 
VectorI get_sample_role_numbers () const
 Counts the samples assigned to each role.
 
Index get_variables_number () const
 Returns the total number of variables (columns of data).
 
Index get_variables_number (const string &role_name) const
 Returns the number of variables assigned to a given role.
 
Index get_used_variables_number () const
 Returns the number of variables that are not "Unused".
 
const vector< Variable > & get_variables () const
 Returns the per-variable metadata.
 
vector< Variable > get_variables (const string &role_name) const
 Returns the variables assigned to a given role.
 
Index get_variable_index (const string &name) const
 Returns the column index of the variable with a given name.
 
Index get_variable_index (const Index id) const
 Returns the column index of the variable with a given numeric id.
 
vector< Index > get_variable_indices (const string &role_name) const
 Returns the column indices of the variables assigned to a given role.
 
vector< Index > get_used_variables_indices () const
 Returns the column indices of all variables that are not "Unused".
 
vector< string > get_variable_names () const
 Returns the names of every variable.
 
vector< string > get_variable_names (const string &role_name) const
 Returns the names of the variables assigned to a given role.
 
VariableType get_variable_type (const Index index) const
 Returns the type of a variable (Numeric, Binary, Categorical, ...).
 
vector< VariableType > get_variable_types (const vector< Index > indices) const
 Returns the types of a list of variables.
 
Index get_features_number () const
 Returns the total number of features.
 
Index get_features_number (const string &role_name) const
 Returns the number of features assigned to a given role.
 
Index get_used_features_number () const
 Returns the number of features that are not "Unused".
 
vector< string > get_feature_names () const
 Returns the names of every feature.
 
vector< string > get_feature_names (const string &role_name) const
 Returns the names of the features assigned to a given role.
 
vector< vector< Index > > get_feature_indices () const
 Returns the per-variable feature indices.
 
vector< Index > get_feature_indices (const Index variable_index) const
 Returns the feature indices for a single variable.
 
vector< Index > get_feature_indices (const string &role_name) const
 Returns the feature indices for variables of a given role.
 
vector< Index > get_used_feature_indices () const
 Returns the feature indices for all variables that are not "Unused".
 
vector< Index > get_feature_dimensions () const
 Returns the per-variable feature dimension (1 for Numeric, N for Categorical).
 
Shape get_shape (const string &role_name) const
 Returns the input or target shape used by the network.
 
vector< string > get_feature_scalers (const string &role_name) const
 Returns the scaler chosen for each variable of a given role.
 
virtual void get_batches (const vector< Index > &sample_indices, Index batch_size, bool shuffle, vector< vector< Index > > &batches) const
 Splits a list of sample indices into batches.
 
const MatrixR & get_data () const
 Returns the raw data matrix.
 
MatrixR get_feature_data (const string &role_name) const
 Returns the data matrix restricted to the features of a given role.
 
MatrixR get_data (const string &sample_role, const string &variable_role) const
 Returns the data restricted to a sample-role and variable-role intersection.
 
MatrixR get_data_from_indices (const vector< Index > &sample_indices, const vector< Index > &variable_indices) const
 Returns the data restricted to specific samples and variables.
 
VectorR get_sample_data (const Index sample_index) const
 Returns a single sample as a row vector.
 
MatrixR get_variable_data (const Index variable_index) const
 Returns the data for a single variable across all samples.
 
MatrixR get_variable_data (const Index variable_index, const vector< Index > &sample_indices) const
 Returns the data for a single variable on a subset of samples.
 
MatrixR get_variable_data (const string &variable_name) const
 Returns the data for a single variable identified by name.
 
const vector< vector< string > > & get_data_file_preview () const
 Returns the cached preview of the source file (first rows).
 
MissingValuesMethod get_missing_values_method () const
 Returns the configured missing-value strategy.
 
string get_missing_values_method_string () const
 Returns the missing-value strategy as a string.
 
const filesystem::path & get_data_path () const
 Returns the path to the source data file.
 
const Separator & get_separator () const
 Returns the configured field separator.
 
string get_separator_string () const
 Returns the field separator as the actual delimiter character(s).
 
string get_separator_name () const
 Returns the field separator as a human-readable name.
 
const Codification & get_codification () const
 Returns the configured source-file codification.
 
const string get_codification_string () const
 Returns the codification as a string.
 
const string & get_missing_values_label () const
 Returns the label that marks missing values in the source file.
 
bool get_display () const
 Reports whether progress messages are printed.
 
bool is_empty () const
 Reports whether the data matrix is empty.
 
Shape get_input_shape () const
 Returns the input shape.
 
Shape get_target_shape () const
 Returns the target shape.
 
void set (const Index samples_number=0, const Shape &input_shape={}, const Shape &target_shape={})
 Resets the dataset to a synthetic shape.
 
void set (const filesystem::path &data_path, const string &separator, bool has_header=true, bool has_ids=false, const Dataset::Codification &codification=Codification::UTF8)
 Resets the dataset by loading a delimited text file.
 
void set (const filesystem::path &file_name)
 Resets the dataset by loading a previously serialized JSON state.
 
void set_default ()
 Resets configuration members to defaults.
 
void set_sample_roles (const string &role_name)
 Assigns the same role to every sample.
 
void set_sample_role (const Index sample_index, const string &role_name)
 Assigns a role to a single sample.
 
void set_sample_roles (const vector< string > &role_names)
 Assigns roles to all samples from a parallel string vector.
 
void set_sample_roles (const vector< Index > &sample_indices, const string &role_name)
 Assigns the same role to a list of samples.
 
void set_variables (const vector< Variable > &new_variables)
 Replaces the per-variable metadata.
 
void set_default_variable_names ()
 Sets default names ("variable_1", "variable_2", ...) for every variable.
 
virtual void set_variable_roles (const vector< string > &role_names)
 Assigns roles to all variables from a parallel string vector.
 
void set_variables (const string &description)
 Re-creates the variables vector from an input/target shape descriptor.
 
void set_variable_indices (const vector< Index > &input_indices, const vector< Index > &target_indices)
 Marks selected variables as Input and others as Target.
 
void set_input_variables_unused ()
 Marks all input variables as Unused.
 
void set_variable_role (const Index variable_index, const string &role_name)
 Sets the role of a single variable by index.
 
void set_variable_role (const string &variable_name, const string &role_name)
 Sets the role of a single variable by name.
 
void set_variable_type (const Index variable_index, const VariableType &type)
 Sets the type of a single variable by index.
 
void set_variable_type (const string &variable_name, const VariableType &type)
 Sets the type of a single variable by name.
 
void set_variable_types (const VariableType &type)
 Sets every variable to a given type.
 
void set_variable_names (const vector< string > &new_variable_names)
 Replaces the names of every variable.
 
void set_variables_number (const Index new_size)
 Resizes the variables vector.
 
void set_variable_scalers (const string &scaler_name)
 Sets the same scaler on every variable.
 
void set_variable_scalers (const vector< string > &scaler_names)
 Sets one scaler per variable.
 
void set_binary_variables ()
 Detects binary variables (two distinct values) and tags them accordingly.
 
void set_feature_names (const vector< string > &new_feature_names)
 Names every feature.
 
void set_variable_roles (const string &role_name)
 Assigns the same role to every variable.
 
void set_shape (const string &role_name, const Shape &new_shape)
 Sets the input or target shape.
 
void set_data (const MatrixR &new_data)
 Replaces the data matrix.
 
void set_data_path (const filesystem::path &new_data_path)
 Sets the path to the source data file.
 
void set_has_header (bool new_has_header)
 Sets whether the source file has a header row.
 
void set_has_ids (bool new_has_ids)
 Sets whether the source file has a sample-id column.
 
void set_separator (const Separator &new_separator)
 Sets the field separator.
 
void set_separator_string (const string &new_separator_string)
 Sets the field separator from its delimiter character(s).
 
void set_separator_name (const string &new_separator_name)
 Sets the field separator from its human-readable name.
 
void set_codification (const Codification &new_codification)
 Sets the source-file codification.
 
void set_codification (const string &new_codification)
 Sets the source-file codification from its name.
 
void set_missing_values_label (string label)
 Sets the label used for missing values in the source file.
 
void set_missing_values_method (const MissingValuesMethod &method)
 Sets the missing-value handling strategy.
 
void set_missing_values_method (const string &method_name)
 Sets the missing-value handling strategy from its name.
 
void set_gmt (const Index new_gmt)
 Sets the GMT offset for time variables.
 
void set_display (bool new_display)
 Toggles progress messages.
 
bool is_sample_used (const Index i) const
 Reports whether a sample is used (any role other than None).
 
bool has_binary_variables () const
 Reports whether at least one variable is binary.
 
bool has_categorical_variables () const
 Reports whether at least one variable is categorical.
 
bool has_binary_or_categorical_variables () const
 Reports whether the dataset has any binary or categorical variable.
 
bool has_time_variable () const
 Reports whether at least one variable plays the Time role.
 
bool has_validation () const
 Reports whether at least one sample is assigned to Validation.
 
bool has_missing_values (const vector< string > &labels) const
 Reports whether the dataset has missing values matching any of the supplied labels.
 
void split_samples (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true)
 Splits samples into training/validation/testing partitions.
 
void split_samples_sequential (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
 Splits samples into partitions in their original order.
 
void split_samples_random (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
 Splits samples into partitions after random shuffling.
 
vector< string > unuse_uncorrelated_variables (const float minimum_correlation=0.25f)
 Marks variables with low correlation against the target as Unused.
 
vector< string > unuse_collinear_variables (const float maximum_correlation=0.95f)
 Marks variables strongly correlated against another input as Unused.
 
void set_data_constant (const float value)
 Fills the data matrix with a constant value.
 
vector< Descriptives > calculate_feature_descriptives () const
 Computes descriptive statistics for every feature.
 
vector< Descriptives > calculate_variable_descriptives_positive_samples () const
 Computes descriptives for inputs restricted to positive-target samples.
 
vector< Descriptives > calculate_variable_descriptives_negative_samples () const
 Computes descriptives for inputs restricted to negative-target samples.
 
vector< Descriptives > calculate_variable_descriptives_categories (const Index variable_index) const
 Computes descriptives per category of a categorical variable.
 
vector< Descriptives > calculate_feature_descriptives (const string &role_name) const
 Computes feature descriptives for a single role.
 
vector< Histogram > calculate_variable_distributions (const Index bins_number=10) const
 Builds histograms of every variable.
 
vector< BoxPlot > calculate_variables_box_plots () const
 Computes box-plot statistics for every variable.
 
Tensor< Correlation, 2 > calculate_input_variable_correlations (Correlation(*correlation_function)(const MatrixR &, const MatrixR &), Correlation::Method method, const string &samples_role) const
 Computes a custom correlation between every pair of input variables.
 
Tensor< Correlation, 2 > calculate_input_variable_pearson_correlations () const
 Computes Pearson correlations between every pair of input variables.
 
Tensor< Correlation, 2 > calculate_input_variable_spearman_correlations () const
 Computes Spearman rank correlations between every pair of input variables.
 
Tensor< Correlation, 2 > calculate_input_target_variable_correlations (Correlation(*correlation_function)(const MatrixR &, const MatrixR &), const string &samples_role) const
 Computes a custom correlation between inputs and targets.
 
Tensor< Correlation, 2 > calculate_input_target_variable_pearson_correlations () const
 Computes Pearson correlations between inputs and targets.
 
Tensor< Correlation, 2 > calculate_input_target_variable_spearman_correlations () const
 Computes Spearman rank correlations between inputs and targets.
 
VectorI calculate_correlations_rank () const
 Returns the rank of every input variable by absolute Pearson correlation against targets.
 
void set_default_variable_scalers ()
 Picks default scalers for every variable based on its type.
 
vector< Descriptives > scale_data ()
 Scales the entire data matrix.
 
virtual vector< Descriptives > scale_features (const string &role_name)
 Scales the features of a given role.
 
void unscale_features (const string &role_name, const vector< Descriptives > &feature_descriptives)
 Inverse-scales the features of a given role.
 
VectorI calculate_target_distribution () const
 Counts the samples of every target class.
 
vector< vector< Index > > calculate_Tukey_outliers (const float tukey_factor=1.5f, bool replace=false)
 Detects Tukey outliers per variable.
 
vector< vector< Index > > replace_Tukey_outliers_with_NaN (const float tukey_factor=1.5f)
 Detects Tukey outliers and replaces them with NaN.
 
void unuse_Tukey_outliers (const float tukey_factor=1.5f)
 Marks Tukey-outlier samples as Unused.
 
virtual void set_data_random ()
 Fills the data matrix with uniform random values.
 
virtual void set_data_integer (const Index vocabulary_size)
 Fills the data matrix with random integers in [0, vocabulary_size).
 
void set_data_rosenbrock ()
 Fills the data matrix with samples from the Rosenbrock function.
 
void set_data_binary_classification ()
 Fills the data matrix with synthetic binary-classification data.
 
virtual void from_JSON (const JsonDocument &document)
 Restores the dataset state from a JSON document.
 
virtual void to_JSON (JsonWriter &writer) const
 Serializes the dataset state to JSON.
 
void save (const filesystem::path &file_name) const
 Saves the dataset state to a JSON file on disk.
 
void load (const filesystem::path &file_name)
 Loads the dataset state from a JSON file on disk.
 
void save_data () const
 Saves the data matrix back to the configured source file.
 
void save_data_binary (const filesystem::path &file_name) const
 Saves the data matrix as a binary file.
 
void load_data_binary ()
 Loads the data matrix from a binary file produced by save_data_binary().
 
Index get_missing_values_number () const
 Returns the total number of cells flagged as missing.
 
bool has_nan () const
 Reports whether the data matrix contains any NaN.
 
bool has_nan_row (const Index row_index) const
 Reports whether a row contains any NaN.
 
virtual void impute_missing_values_unuse ()
 Marks samples with missing values as Unused.
 
void impute_missing_values_statistic (const MissingValuesMethod &method)
 Replaces missing values with a per-variable statistic.
 
virtual void impute_missing_values_interpolate ()
 Replaces missing values via linear interpolation along each variable.
 
void scrub_missing_values ()
 Removes samples that contain missing values.
 
void calculate_missing_values_statistics ()
 Updates the cached missing-value statistics (counts, indices).
 
VectorI count_nans_per_variable () const
 Counts NaN cells per variable.
 
Index count_variables_with_nan () const
 Counts variables that contain at least one NaN.
 
Index count_rows_with_nan () const
 Counts samples that contain at least one NaN.
 
Index count_nan () const
 Counts the total number of NaN cells.
 
vector< vector< Index > > split_samples (const vector< Index > &indices, Index parts_number) const
 Splits a list of sample indices into chunks of (roughly) equal size.
 
virtual void read_csv ()
 Reads the configured CSV file into the data matrix.
 
DateFormat infer_dataset_date_format (const vector< Variable > &variables, const vector< vector< string > > &data_file_preview, bool has_header, const string &missing_values_label)
 Infers the date format used by date-typed variables in the source file.
 
virtual void fill_inputs (const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const
 Fills a contiguous float buffer with input data for a batch of samples.
 
virtual void augment_inputs (float *buffer, Index batch_size) const
 Optionally augments inputs in-place after fill_inputs() (e.g. random crops).
 
virtual void fill_decoder (const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const
 Fills a contiguous buffer with decoder-side inputs (transformer-style models).
 
virtual void fill_targets (const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const
 Fills a contiguous buffer with target data for a batch of samples.
 

Protected Member Functions

void set_default_variable_roles ()
 Sets default Input/Target roles based on column position (last column = target).
 
void set_default_variable_roles_forecasting ()
 Sets default roles for forecasting (typical pattern: lagged inputs + future target).
 
void variables_to_JSON (JsonWriter &) const
 Serializes the variables vector to JSON.
 
void samples_to_JSON (JsonWriter &) const
 Serializes the per-sample roles to JSON.
 
void missing_values_to_JSON (JsonWriter &) const
 Serializes the missing-value statistics to JSON.
 
void preview_data_to_JSON (JsonWriter &) const
 Serializes the source-file preview to JSON.
 
void variables_from_JSON (const Json *)
 Restores the variables vector from JSON.
 
void samples_from_JSON (const Json *)
 Restores the per-sample roles from JSON.
 
void missing_values_from_JSON (const Json *)
 Restores the missing-value statistics from JSON.
 
void preview_data_from_JSON (const Json *)
 Restores the source-file preview from JSON.
 

Protected Attributes

MatrixR data
 Dense data matrix [samples x variables].
 
Shape input_shape
 Shape of the input portion (rank may exceed 1 for image/sequence data).
 
Shape target_shape
 Shape of the target portion.
 
Shape decoder_shape
 Shape of the decoder portion (transformer-style models).
 
vector< SampleRole > sample_roles
 Per-sample role (Training/Validation/Testing/None).
 
vector< string > sample_ids
 Optional per-sample identifiers (when the source file has an id column).
 
vector< Variable > variables
 Per-variable metadata (name, role, type, scaler).
 
filesystem::path data_path
 Path to the source data file.
 
Separator separator = Separator::Comma
 Field separator used in the source file.
 
string missing_values_label = "NA"
 Label that marks missing values in the source file.
 
bool has_header = false
 Whether the source file's first row contains column names.
 
bool has_sample_ids = false
 Whether the source file's first column contains sample identifiers.
 
Codification codification = Codification::UTF8
 Source-file character encoding.
 
vector< vector< string > > data_file_preview
 Cached preview of the first rows of the source file.
 
Index gmt = 0
 GMT offset for time variables, in hours.
 
MissingValuesMethod missing_values_method = MissingValuesMethod::Mean
 Strategy used to handle missing values.
 
Index missing_values_number = 0
 Total number of missing cells.
 
VectorI variables_missing_values_number
 Per-variable missing-value count.
 
Index rows_missing_values_number = 0
 Number of rows that contain at least one missing value.
 
bool display = true
 Whether to print progress messages.
 
const vector< string > positive_words = {"1", "yes", "positive", "+", "true", "good", "si", "sí", "Sí"}
 Strings interpreted as positive when parsing binary variables.
 
const vector< string > negative_words = {"0", "no", "negative", "-", "false", "bad", "not", "No"}
 Strings interpreted as negative when parsing binary variables.
 

Detailed Description

Base data container with samples, variables and per-variable metadata.

Owns a dense matrix of values (rows are samples, columns are variables) plus parallel metadata: per-sample role, per-variable Variable description (name, role, type, scaler), input/target shapes, missing-value handling, and source-file metadata.

Provides utilities for loading from CSV, splitting into Training / Validation / Testing partitions, scaling and unscaling features, descriptive statistics, correlation analysis and Tukey-based outlier handling.

Specialized data formats are implemented by deriving from this class: TabularDataset, ImageDataset, LanguageDataset, TimeSeriesDataset.

Member Enumeration Documentation

◆ Codification

enum class opennn::Dataset::Codification
strong

Source-file character encoding.

Enumerator
UTF8 
SHIFT_JIS 

◆ MissingValuesMethod

enum class opennn::Dataset::MissingValuesMethod
strong

Strategy for replacing missing values.

Enumerator
Unuse 
Mean 
Median 
Interpolation 

◆ Separator

enum class opennn::Dataset::Separator
strong

Field-separator type for tabular files.

Enumerator
Space 
Tab 
Comma 
Semicolon 

Constructor & Destructor Documentation

◆ Dataset() [1/2]

opennn::Dataset::Dataset ( const Index samples_number = 0,
const Shape & input_shape = {0},
const Shape & target_shape = {0} )

Constructs an empty dataset of given dimensions.

Parameters
samples_number: Number of samples (rows).
input_shape: Shape of the input portion (columns assigned the "Input" role).
target_shape: Shape of the target portion (columns assigned the "Target" role).

◆ Dataset() [2/2]

opennn::Dataset::Dataset ( const filesystem::path & data_path,
const string & separator,
bool has_header = true,
bool has_ids = false,
const Codification & codification = Codification::UTF8 )

Constructs a dataset by loading a delimited text file.

Parameters
data_path: Path to the source file.
separator: Field separator string ("," ";" "\t" or " ").
has_header: Whether the first row contains column names.
has_ids: Whether the first column contains sample identifiers.
codification: Source-file character encoding.

Member Function Documentation

◆ augment_inputs()

virtual void opennn::Dataset::augment_inputs ( float * buffer,
Index batch_size ) const
inline virtual

Optionally augments inputs in-place after fill_inputs() (e.g. random crops).

Default implementation is a no-op; subclasses override.

Parameters
buffer: Buffer produced by fill_inputs().
batch_size: Number of samples in the buffer.

Reimplemented in opennn::ImageDataset.

◆ calculate_correlations_rank()

VectorI opennn::Dataset::calculate_correlations_rank ( ) const

Returns the rank of every input variable by absolute Pearson correlation against targets.

Returns
Vector of ranks aligned to input variables.

◆ calculate_feature_descriptives() [1/2]

vector< Descriptives > opennn::Dataset::calculate_feature_descriptives ( ) const

Computes descriptive statistics for every feature.

Returns
One Descriptives entry per feature.

◆ calculate_feature_descriptives() [2/2]

vector< Descriptives > opennn::Dataset::calculate_feature_descriptives ( const string & role_name) const

Computes feature descriptives for a single role.

Parameters
role_name: Variable role.
Returns
One Descriptives entry per feature in the role.

◆ calculate_input_target_variable_correlations()

Tensor< Correlation, 2 > opennn::Dataset::calculate_input_target_variable_correlations ( Correlation(* correlation_function )(const MatrixR &, const MatrixR &),
const string & samples_role ) const

Computes a custom correlation between inputs and targets.

Parameters
correlation_function: Function used to compute the pair correlation.
samples_role: Sample-role filter.
Returns
Correlation matrix [inputs x targets].

◆ calculate_input_target_variable_pearson_correlations()

Tensor< Correlation, 2 > opennn::Dataset::calculate_input_target_variable_pearson_correlations ( ) const

Computes Pearson correlations between inputs and targets.

Returns
Correlation matrix [inputs x targets].

◆ calculate_input_target_variable_spearman_correlations()

Tensor< Correlation, 2 > opennn::Dataset::calculate_input_target_variable_spearman_correlations ( ) const

Computes Spearman rank correlations between inputs and targets.

Returns
Correlation matrix [inputs x targets].

◆ calculate_input_variable_correlations()

Tensor< Correlation, 2 > opennn::Dataset::calculate_input_variable_correlations ( Correlation(* correlation_function )(const MatrixR &, const MatrixR &),
Correlation::Method method,
const string & samples_role ) const

Computes a custom correlation between every pair of input variables.

Parameters
correlation_function: Function used to compute the pair correlation.
method: Correlation method (Pearson, Spearman, ...).
samples_role: Sample-role filter.
Returns
Symmetric Correlation matrix [inputs x inputs].

◆ calculate_input_variable_pearson_correlations()

Tensor< Correlation, 2 > opennn::Dataset::calculate_input_variable_pearson_correlations ( ) const

Computes Pearson correlations between every pair of input variables.

Returns
Symmetric Correlation matrix [inputs x inputs].

◆ calculate_input_variable_spearman_correlations()

Tensor< Correlation, 2 > opennn::Dataset::calculate_input_variable_spearman_correlations ( ) const

Computes Spearman rank correlations between every pair of input variables.

Returns
Symmetric Correlation matrix [inputs x inputs].

◆ calculate_missing_values_statistics()

void opennn::Dataset::calculate_missing_values_statistics ( )

Updates the cached missing-value statistics (counts, indices).

◆ calculate_target_distribution()

VectorI opennn::Dataset::calculate_target_distribution ( ) const

Counts the samples of every target class.

Returns
Vector with one count per class.

◆ calculate_Tukey_outliers()

vector< vector< Index > > opennn::Dataset::calculate_Tukey_outliers ( const float tukey_factor = 1.5f,
bool replace = false )

Detects Tukey outliers per variable.

Parameters
tukey_factor: Tukey-fence multiplier (1.5 = mild, 3.0 = extreme).
replace: Whether to also replace the outliers with NaN.
Returns
Outer vector indexed by variable, inner by outlier sample index.

◆ calculate_variable_descriptives_categories()

vector< Descriptives > opennn::Dataset::calculate_variable_descriptives_categories ( const Index variable_index) const

Computes descriptives per category of a categorical variable.

Parameters
variable_index: Column index of the categorical variable.
Returns
One Descriptives entry per (input variable, category) pair.

◆ calculate_variable_descriptives_negative_samples()

vector< Descriptives > opennn::Dataset::calculate_variable_descriptives_negative_samples ( ) const

Computes descriptives for inputs restricted to negative-target samples.

Returns
One Descriptives entry per input variable.

◆ calculate_variable_descriptives_positive_samples()

vector< Descriptives > opennn::Dataset::calculate_variable_descriptives_positive_samples ( ) const

Computes descriptives for inputs restricted to positive-target samples.

Returns
One Descriptives entry per input variable.

◆ calculate_variable_distributions()

vector< Histogram > opennn::Dataset::calculate_variable_distributions ( const Index bins_number = 10) const

Builds histograms of every variable.

Parameters
bins_number: Number of bins per histogram.
Returns
One Histogram per variable.

◆ calculate_variables_box_plots()

vector< BoxPlot > opennn::Dataset::calculate_variables_box_plots ( ) const

Computes box-plot statistics for every variable.

Returns
One BoxPlot per variable.

◆ count_nan()

Index opennn::Dataset::count_nan ( ) const

Counts the total number of NaN cells.

Returns
Total NaN count.

◆ count_nans_per_variable()

VectorI opennn::Dataset::count_nans_per_variable ( ) const

Counts NaN cells per variable.

Returns
Vector with one count per variable.

◆ count_rows_with_nan()

Index opennn::Dataset::count_rows_with_nan ( ) const

Counts samples that contain at least one NaN.

Returns
Number of affected rows.

◆ count_variables_with_nan()

Index opennn::Dataset::count_variables_with_nan ( ) const

Counts variables that contain at least one NaN.

Returns
Number of affected variables.

◆ fill_decoder()

virtual void opennn::Dataset::fill_decoder ( const vector< Index > & sample_indices,
const vector< Index > & feature_indices,
float * buffer,
bool transpose = true,
int contiguous = -1 ) const
virtual

Fills a contiguous buffer with decoder-side inputs (transformer-style models).

Parameters
sample_indices: Row indices for the batch.
feature_indices: Column indices for the decoder features.
buffer: Destination buffer.
transpose: Whether to write feature-major or sample-major.
contiguous: Stride hint.

◆ fill_inputs()

virtual void opennn::Dataset::fill_inputs ( const vector< Index > & sample_indices,
const vector< Index > & feature_indices,
float * buffer,
bool transpose = true,
int contiguous = -1 ) const
virtual

Fills a contiguous float buffer with input data for a batch of samples.

Used by Loss/Optimizer to build forward-pass inputs without reallocating.

Parameters
sample_indices: Row indices for the batch.
feature_indices: Column indices for the input features.
buffer: Destination buffer; must hold sample_indices.size() * feature_indices.size() floats.
transpose: Whether to write feature-major (true) or sample-major (false).
contiguous: Stride hint when sample_indices is a contiguous range; -1 disables.

Reimplemented in opennn::TimeSeriesDataset.

◆ fill_targets()

virtual void opennn::Dataset::fill_targets ( const vector< Index > & sample_indices,
const vector< Index > & feature_indices,
float * buffer,
bool transpose = true,
int contiguous = -1 ) const
virtual

Fills a contiguous buffer with target data for a batch of samples.

Parameters
sample_indices: Row indices for the batch.
feature_indices: Column indices for the target features.
buffer: Destination buffer.
transpose: Whether to write feature-major (true) or sample-major (false).
contiguous: Stride hint (same meaning as in fill_inputs()).

Reimplemented in opennn::TimeSeriesDataset.

◆ from_JSON()

virtual void opennn::Dataset::from_JSON ( const JsonDocument & document)
virtual

Restores the dataset state from a JSON document.

Parameters
documentParsed JSON produced by to_JSON().

Reimplemented in opennn::ImageDataset, opennn::LanguageDataset, and opennn::TimeSeriesDataset.

◆ get_batches()

virtual void opennn::Dataset::get_batches ( const vector< Index > & sample_indices,
Index batch_size,
bool shuffle,
vector< vector< Index > > & batches ) const
virtual

Splits a list of sample indices into batches.

Optionally shuffles the indices before batching.

Parameters
sample_indicesSample indices to batch.
batch_sizeTarget batch size.
shuffleWhether to shuffle indices.
batchesOutput vector of batches; populated in place.

◆ get_codification()

const Codification & opennn::Dataset::get_codification ( ) const
inline

Returns the configured source-file codification.

Returns
Codification enum value.

◆ get_codification_string()

const string opennn::Dataset::get_codification_string ( ) const

Returns the codification as a string.

Returns
One of "UTF8", "SHIFT_JIS".

◆ get_data() [1/2]

const MatrixR & opennn::Dataset::get_data ( ) const
inline

Returns the raw data matrix.

Returns
Const reference to the [samples x variables] matrix.

◆ get_data() [2/2]

MatrixR opennn::Dataset::get_data ( const string & sample_role,
const string & variable_role ) const

Returns the data restricted to a sample-role and variable-role intersection.

Parameters
sample_roleSample role.
variable_roleVariable role.
Returns
Matrix [samples_number(sample_role) x features_number(variable_role)].

◆ get_data_file_preview()

const vector< vector< string > > & opennn::Dataset::get_data_file_preview ( ) const
inline

Returns the cached preview of the source file (first rows).

Returns
Const reference to the preview rows.

◆ get_data_from_indices()

MatrixR opennn::Dataset::get_data_from_indices ( const vector< Index > & sample_indices,
const vector< Index > & variable_indices ) const

Returns the data restricted to specific samples and variables.

Parameters
sample_indicesRow indices.
variable_indicesColumn indices.
Returns
Submatrix in row-major order.

◆ get_data_path()

const filesystem::path & opennn::Dataset::get_data_path ( ) const
inline

Returns the path to the source data file.

Returns
Const reference to the data path.

◆ get_display()

bool opennn::Dataset::get_display ( ) const
inline

Reports whether progress messages are printed.

Returns
true when display is on.

◆ get_feature_data()

MatrixR opennn::Dataset::get_feature_data ( const string & role_name) const

Returns the data matrix restricted to the features of a given role.

Parameters
role_nameVariable role.
Returns
Matrix [used_samples x features_number(role)].

◆ get_feature_dimensions()

vector< Index > opennn::Dataset::get_feature_dimensions ( ) const

Returns the number of features each variable expands into (1 for Numeric, N for a Categorical variable with N classes).

Returns
Vector of feature dimensions.

◆ get_feature_indices() [1/3]

vector< vector< Index > > opennn::Dataset::get_feature_indices ( ) const

Returns the per-variable feature indices.

Returns
Outer vector indexed by variable, inner by feature within that variable.

◆ get_feature_indices() [2/3]

vector< Index > opennn::Dataset::get_feature_indices ( const Index variable_index) const

Returns the feature indices for a single variable.

Parameters
variable_indexColumn index of the variable.
Returns
Feature indices generated by that variable.

◆ get_feature_indices() [3/3]

vector< Index > opennn::Dataset::get_feature_indices ( const string & role_name) const

Returns the feature indices for variables of a given role.

Parameters
role_nameVariable role.
Returns
Vector of feature indices.

◆ get_feature_names() [1/2]

vector< string > opennn::Dataset::get_feature_names ( ) const

Returns the names of every feature.

Returns
Vector of feature names.

◆ get_feature_names() [2/2]

vector< string > opennn::Dataset::get_feature_names ( const string & role_name) const

Returns the names of the features assigned to a given role.

Parameters
role_nameVariable role.
Returns
Vector of feature names matching the role.

◆ get_feature_scalers()

vector< string > opennn::Dataset::get_feature_scalers ( const string & role_name) const

Returns the scaler chosen for each variable of a given role.

Parameters
role_nameVariable role.
Returns
Vector of scaler names ("MinimumMaximum", "MeanStandardDeviation", ...).

◆ get_features_number() [1/2]

Index opennn::Dataset::get_features_number ( ) const

Returns the total number of features.

One categorical variable expands into N features (one per class), so features_number >= variables_number in general.

Returns
Feature count summed across variables.

◆ get_features_number() [2/2]

Index opennn::Dataset::get_features_number ( const string & role_name) const

Returns the number of features assigned to a given role.

Parameters
role_nameVariable role.
Returns
Feature count for the role.

◆ get_input_shape()

Shape opennn::Dataset::get_input_shape ( ) const
inline

Returns the input shape.

Returns
Shape used to construct the network's first layer.

◆ get_missing_values_label()

const string & opennn::Dataset::get_missing_values_label ( ) const
inline

Returns the label that marks missing values in the source file.

Returns
Configured missing-value label (defaults to "NA").

◆ get_missing_values_method()

MissingValuesMethod opennn::Dataset::get_missing_values_method ( ) const
inline

Returns the configured missing-value strategy.

Returns
MissingValuesMethod enum value.

◆ get_missing_values_method_string()

string opennn::Dataset::get_missing_values_method_string ( ) const

Returns the missing-value strategy as a string.

Returns
One of "Unuse", "Mean", "Median", "Interpolation".

◆ get_missing_values_number()

Index opennn::Dataset::get_missing_values_number ( ) const
inline

Returns the total number of cells flagged as missing.

Returns
Missing-value count.

◆ get_sample_data()

VectorR opennn::Dataset::get_sample_data ( const Index sample_index) const

Returns a single sample as a row vector.

Parameters
sample_indexRow index.
Returns
Vector with all variable values for that sample.

◆ get_sample_indices()

vector< Index > opennn::Dataset::get_sample_indices ( const string & role_name) const

Returns the indices of the samples assigned to a given role.

Parameters
role_name"Training", "Validation", "Testing" or "None".
Returns
Vector of sample indices for the role.

◆ get_sample_role_numbers()

VectorI opennn::Dataset::get_sample_role_numbers ( ) const

Counts the samples assigned to each role.

Returns
Length-4 vector: [Training, Validation, Testing, None] counts.

◆ get_sample_roles()

const vector< SampleRole > & opennn::Dataset::get_sample_roles ( ) const
inline

Returns the per-sample role assignments.

Returns
Const reference to the role vector.

◆ get_sample_roles_vector()

vector< Index > opennn::Dataset::get_sample_roles_vector ( ) const

Returns the per-sample role indices as plain integers.

Returns
Vector with the integer encoding of each role.

◆ get_samples_number() [1/2]

Index opennn::Dataset::get_samples_number ( ) const
inline

Returns the total number of samples (rows of data).

Returns
Sample count.

◆ get_samples_number() [2/2]

Index opennn::Dataset::get_samples_number ( const string & role_name) const

Returns the number of samples assigned to a given role.

Parameters
role_name"Training", "Validation", "Testing" or "None".
Returns
Sample count for the role.

◆ get_separator()

const Separator & opennn::Dataset::get_separator ( ) const
inline

Returns the configured field separator.

Returns
Separator enum value.

◆ get_separator_name()

string opennn::Dataset::get_separator_name ( ) const

Returns the field separator as a human-readable name.

Returns
Separator name ("Comma", "Semicolon", "Tab", "Space").

◆ get_separator_string()

string opennn::Dataset::get_separator_string ( ) const

Returns the field separator as the actual delimiter character(s).

Returns
Separator string (",", ";", "\t" or " ").

◆ get_shape()

Shape opennn::Dataset::get_shape ( const string & role_name) const

Returns the input or target shape used by the network.

Parameters
role_name"Input" or "Target".
Returns
Shape with the number of features for that role.

◆ get_target_shape()

Shape opennn::Dataset::get_target_shape ( ) const
inline

Returns the target shape.

Returns
Shape used to construct the network's output layer.

◆ get_used_feature_indices()

vector< Index > opennn::Dataset::get_used_feature_indices ( ) const

Returns the feature indices for all variables that are not "Unused".

Returns
Vector of used feature indices.

◆ get_used_features_number()

Index opennn::Dataset::get_used_features_number ( ) const

Returns the number of features that are not "Unused".

Returns
Used feature count.

◆ get_used_sample_indices()

vector< Index > opennn::Dataset::get_used_sample_indices ( ) const

Returns the indices of all samples that are not "None".

Returns
Vector of used sample indices.

◆ get_used_samples_number()

Index opennn::Dataset::get_used_samples_number ( ) const

Returns the number of samples that are not "None".

Returns
Used sample count.

◆ get_used_variables_indices()

vector< Index > opennn::Dataset::get_used_variables_indices ( ) const

Returns the column indices of all variables that are not "Unused".

Returns
Vector of used column indices.

◆ get_used_variables_number()

Index opennn::Dataset::get_used_variables_number ( ) const

Returns the number of variables that are not "Unused".

Returns
Used variable count.

◆ get_variable_data() [1/3]

MatrixR opennn::Dataset::get_variable_data ( const Index variable_index) const

Returns the data for a single variable across all samples.

Parameters
variable_indexColumn index.
Returns
Column matrix.

◆ get_variable_data() [2/3]

MatrixR opennn::Dataset::get_variable_data ( const Index variable_index,
const vector< Index > & sample_indices ) const

Returns the data for a single variable on a subset of samples.

Parameters
variable_indexColumn index.
sample_indicesRow indices.
Returns
Column matrix restricted to sample_indices.

◆ get_variable_data() [3/3]

MatrixR opennn::Dataset::get_variable_data ( const string & variable_name) const

Returns the data for a single variable identified by name.

Parameters
variable_nameVariable name.
Returns
Column matrix.

◆ get_variable_index() [1/2]

Index opennn::Dataset::get_variable_index ( const Index id) const

Returns the column index of the variable with a given numeric id.

Parameters
idVariable id.
Returns
Column index in data.

◆ get_variable_index() [2/2]

Index opennn::Dataset::get_variable_index ( const string & name) const

Returns the column index of the variable with a given name.

Parameters
nameVariable name.
Returns
Column index in data.

◆ get_variable_indices()

vector< Index > opennn::Dataset::get_variable_indices ( const string & role_name) const

Returns the column indices of the variables assigned to a given role.

Parameters
role_nameVariable role.
Returns
Vector of column indices.

◆ get_variable_names() [1/2]

vector< string > opennn::Dataset::get_variable_names ( ) const

Returns the names of every variable.

Returns
Vector of variable names.

◆ get_variable_names() [2/2]

vector< string > opennn::Dataset::get_variable_names ( const string & role_name) const

Returns the names of the variables assigned to a given role.

Parameters
role_nameVariable role.
Returns
Vector of variable names matching the role.

◆ get_variable_type()

VariableType opennn::Dataset::get_variable_type ( const Index index) const
inline

Returns the type of a variable (Numeric, Binary, Categorical, ...).

Parameters
indexColumn index.
Returns
Variable type.

◆ get_variable_types()

vector< VariableType > opennn::Dataset::get_variable_types ( const vector< Index > indices) const

Returns the types of a list of variables.

Parameters
indicesColumn indices.
Returns
Variable types in the same order as indices.

◆ get_variables() [1/2]

const vector< Variable > & opennn::Dataset::get_variables ( ) const
inline

Returns the per-variable metadata.

Returns
Const reference to the variables vector.

◆ get_variables() [2/2]

vector< Variable > opennn::Dataset::get_variables ( const string & role_name) const

Returns the variables assigned to a given role.

Parameters
role_nameVariable role.
Returns
Vector of matching Variable entries.

◆ get_variables_number() [1/2]

Index opennn::Dataset::get_variables_number ( ) const
inline

Returns the total number of variables (columns of data).

Returns
Variable count.

◆ get_variables_number() [2/2]

Index opennn::Dataset::get_variables_number ( const string & role_name) const

Returns the number of variables assigned to a given role.

Parameters
role_nameVariable role ("Input", "Target", "Time", "Unused", "Decoder").
Returns
Variable count for the role.

◆ has_binary_or_categorical_variables()

bool opennn::Dataset::has_binary_or_categorical_variables ( ) const

Reports whether the dataset has any binary or categorical variable.

Returns
true when at least one variable is Binary or Categorical.

◆ has_binary_variables()

bool opennn::Dataset::has_binary_variables ( ) const

Reports whether at least one variable is binary.

Returns
true when any variable has type Binary.

◆ has_categorical_variables()

bool opennn::Dataset::has_categorical_variables ( ) const

Reports whether at least one variable is categorical.

Returns
true when any variable has type Categorical.

◆ has_missing_values()

bool opennn::Dataset::has_missing_values ( const vector< string > & labels) const

Reports whether the dataset has missing values matching any of the supplied labels.

Parameters
labelsCandidate missing-value labels.
Returns
true when any sample contains one of the labels.

◆ has_nan()

bool opennn::Dataset::has_nan ( ) const

Reports whether the data matrix contains any NaN.

Returns
true when at least one cell is NaN.

◆ has_nan_row()

bool opennn::Dataset::has_nan_row ( const Index row_index) const

Reports whether a row contains any NaN.

Parameters
row_indexRow to inspect.
Returns
true when at least one cell of row_index is NaN.

◆ has_time_variable()

bool opennn::Dataset::has_time_variable ( ) const

Reports whether at least one variable plays the Time role.

Returns
true when a Time variable exists.

◆ has_validation()

bool opennn::Dataset::has_validation ( ) const

Reports whether at least one sample is assigned to Validation.

Returns
true when validation samples exist.

◆ impute_missing_values_interpolate()

virtual void opennn::Dataset::impute_missing_values_interpolate ( )
virtual

Replaces missing values via linear interpolation along each variable.

Reimplemented in opennn::TimeSeriesDataset.

◆ impute_missing_values_statistic()

void opennn::Dataset::impute_missing_values_statistic ( const MissingValuesMethod & method)

Replaces missing values with a per-variable statistic.

Parameters
method: Statistic used for imputation (MissingValuesMethod::Mean or MissingValuesMethod::Median).

◆ impute_missing_values_unuse()

virtual void opennn::Dataset::impute_missing_values_unuse ( )
virtual

Marks samples with missing values as unused (sample role "None").

Reimplemented in opennn::TimeSeriesDataset.

◆ infer_dataset_date_format()

DateFormat opennn::Dataset::infer_dataset_date_format ( const vector< Variable > & variables,
const vector< vector< string > > & data_file_preview,
bool has_header,
const string & missing_values_label )

Infers the date format used by date-typed variables in the source file.

Parameters
variablesVariables to inspect.
data_file_previewPreview rows from the source file.
has_headerWhether the preview includes a header row.
missing_values_labelMissing-value label to ignore.
Returns
Inferred date format.

◆ is_empty()

bool opennn::Dataset::is_empty ( ) const
inline

Reports whether the data matrix is empty.

Returns
true when data has zero elements.

◆ is_sample_used()

bool opennn::Dataset::is_sample_used ( const Index i) const
inline

Reports whether a sample is used (any role other than None).

Parameters
iSample index.
Returns
true when the sample is used.

◆ load()

void opennn::Dataset::load ( const filesystem::path & file_name)

Loads the dataset state from a JSON file on disk.

Parameters
file_nameSource path.

◆ load_data_binary()

void opennn::Dataset::load_data_binary ( )

Loads the data matrix from a binary file produced by save_data_binary().

◆ missing_values_from_JSON()

void opennn::Dataset::missing_values_from_JSON ( const Json * )
protected

Restores the missing-value statistics from JSON.

◆ missing_values_to_JSON()

void opennn::Dataset::missing_values_to_JSON ( JsonWriter & ) const
protected

Serializes the missing-value statistics to JSON.

◆ preview_data_from_JSON()

void opennn::Dataset::preview_data_from_JSON ( const Json * )
protected

Restores the source-file preview from JSON.

◆ preview_data_to_JSON()

void opennn::Dataset::preview_data_to_JSON ( JsonWriter & ) const
protected

Serializes the source-file preview to JSON.

◆ read_csv()

virtual void opennn::Dataset::read_csv ( )
virtual

Reads the configured CSV file into the data matrix.

Reimplemented in opennn::LanguageDataset, and opennn::TimeSeriesDataset.

◆ replace_Tukey_outliers_with_NaN()

vector< vector< Index > > opennn::Dataset::replace_Tukey_outliers_with_NaN ( const float tukey_factor = 1.5f)

Detects Tukey outliers and replaces them with NaN.

Parameters
tukey_factorTukey-fence multiplier.
Returns
Outer vector indexed by variable, inner by sample index that was replaced.

◆ samples_from_JSON()

void opennn::Dataset::samples_from_JSON ( const Json * )
protected

Restores the per-sample roles from JSON.

◆ samples_to_JSON()

void opennn::Dataset::samples_to_JSON ( JsonWriter & ) const
protected

Serializes the per-sample roles to JSON.

◆ save()

void opennn::Dataset::save ( const filesystem::path & file_name) const

Saves the dataset state to a JSON file on disk.

Parameters
file_nameDestination path.

◆ save_data()

void opennn::Dataset::save_data ( ) const

Saves the data matrix back to the configured source file.

◆ save_data_binary()

void opennn::Dataset::save_data_binary ( const filesystem::path & file_name) const

Saves the data matrix as a binary file.

Parameters
file_nameDestination path.

◆ scale_data()

vector< Descriptives > opennn::Dataset::scale_data ( )

Scales the entire data matrix.

Returns
One Descriptives entry per variable, computed before scaling.

◆ scale_features()

virtual vector< Descriptives > opennn::Dataset::scale_features ( const string & role_name)
virtual

Scales the features of a given role.

Reads the scalers from the per-variable metadata, computes the descriptives of the unscaled data, scales the data in place and returns the descriptives so they can be reused (for inverse-scaling outputs or configuring the network's Scaling layer).

Parameters
role_nameVariable role to scale (typically "Input").
Returns
One Descriptives entry per feature in the role.

Reimplemented in opennn::ImageDataset.

◆ scrub_missing_values()

void opennn::Dataset::scrub_missing_values ( )

Removes samples that contain missing values.

◆ set() [1/3]

void opennn::Dataset::set ( const filesystem::path & data_path,
const string & separator,
bool has_header = true,
bool has_ids = false,
const Dataset::Codification & codification = Codification::UTF8 )

Resets the dataset by loading a delimited text file.

Parameters
data_pathPath to the source file.
separatorField separator string.
has_headerWhether the first row contains column names.
has_idsWhether the first column contains sample identifiers.
codificationSource-file character encoding.

◆ set() [2/3]

void opennn::Dataset::set ( const filesystem::path & file_name)

Resets the dataset by loading a previously serialized JSON state.

Parameters
file_namePath to the JSON file.

◆ set() [3/3]

void opennn::Dataset::set ( const Index samples_number = 0,
const Shape & input_shape = {},
const Shape & target_shape = {} )

Resets the dataset to a synthetic shape.

Parameters
samples_numberNumber of samples.
input_shapeShape of the input portion.
target_shapeShape of the target portion.

◆ set_binary_variables()

void opennn::Dataset::set_binary_variables ( )

Detects binary variables (two distinct values) and tags them accordingly.

◆ set_codification() [1/2]

void opennn::Dataset::set_codification ( const Codification & new_codification)
inline

Sets the source-file codification.

Parameters
new_codificationCodification enum value.

◆ set_codification() [2/2]

void opennn::Dataset::set_codification ( const string & new_codification)

Sets the source-file codification from its name.

Parameters
new_codification"UTF8" or "SHIFT_JIS".

◆ set_data()

void opennn::Dataset::set_data ( const MatrixR & new_data)

Replaces the data matrix.

The numbers of rows and columns must match the existing sample and variable counts.

Parameters
new_dataReplacement matrix.

◆ set_data_binary_classification()

void opennn::Dataset::set_data_binary_classification ( )

Fills the data matrix with synthetic binary-classification data.

◆ set_data_constant()

void opennn::Dataset::set_data_constant ( const float value)

Fills the data matrix with a constant value.

Parameters
valueValue used to fill every cell.

◆ set_data_integer()

virtual void opennn::Dataset::set_data_integer ( const Index vocabulary_size)
virtual

Fills the data matrix with random integers in [0, vocabulary_size).

Parameters
vocabulary_sizeExclusive upper bound for the integers.

◆ set_data_path()

void opennn::Dataset::set_data_path ( const filesystem::path & new_data_path)
inline

Sets the path to the source data file.

Parameters
new_data_pathNew path.

◆ set_data_random()

virtual void opennn::Dataset::set_data_random ( )
virtual

Fills the data matrix with uniform random values.

Reimplemented in opennn::ImageDataset.

◆ set_data_rosenbrock()

void opennn::Dataset::set_data_rosenbrock ( )

Fills the data matrix with samples from the Rosenbrock function.

◆ set_default()

void opennn::Dataset::set_default ( )

Resets configuration members to defaults.

◆ set_default_variable_names()

void opennn::Dataset::set_default_variable_names ( )

Sets default names ("variable_1", "variable_2", ...) for every variable.

◆ set_default_variable_roles()

void opennn::Dataset::set_default_variable_roles ( )
protected

Sets default Input/Target roles based on column position (last column = target).

◆ set_default_variable_roles_forecasting()

void opennn::Dataset::set_default_variable_roles_forecasting ( )
protected

Sets default roles for forecasting (typical pattern: lagged inputs + future target).

◆ set_default_variable_scalers()

void opennn::Dataset::set_default_variable_scalers ( )

Picks default scalers for every variable based on its type.

◆ set_display()

void opennn::Dataset::set_display ( bool new_display)
inline

Toggles progress messages.

Parameters
new_displaytrue to enable.

◆ set_feature_names()

void opennn::Dataset::set_feature_names ( const vector< string > & new_feature_names)

Names every feature.

Each categorical variable contributes one name per class.

Parameters
new_feature_namesOne name per feature.

◆ set_gmt()

void opennn::Dataset::set_gmt ( const Index new_gmt)
inline

Sets the GMT offset for time variables.

Parameters
new_gmtOffset in hours.

◆ set_has_header()

void opennn::Dataset::set_has_header ( bool new_has_header)
inline

Sets whether the source file has a header row.

Parameters
new_has_headertrue if the first row contains names.

◆ set_has_ids()

void opennn::Dataset::set_has_ids ( bool new_has_ids)
inline

Sets whether the source file has a sample-id column.

Parameters
new_has_idstrue if the first column contains identifiers.

◆ set_input_variables_unused()

void opennn::Dataset::set_input_variables_unused ( )

Marks all input variables as Unused.

◆ set_missing_values_label()

void opennn::Dataset::set_missing_values_label ( string label)
inline

Sets the label used for missing values in the source file.

Parameters
labelNew label.

◆ set_missing_values_method() [1/2]

void opennn::Dataset::set_missing_values_method ( const MissingValuesMethod & method)
inline

Sets the missing-value handling strategy.

Parameters
methodMissingValuesMethod enum value.

◆ set_missing_values_method() [2/2]

void opennn::Dataset::set_missing_values_method ( const string & method_name)

Sets the missing-value handling strategy from its name.

Parameters
method_name"Unuse", "Mean", "Median" or "Interpolation".

◆ set_sample_role()

void opennn::Dataset::set_sample_role ( const Index sample_index,
const string & role_name )

Assigns a role to a single sample.

Parameters
sample_indexRow index.
role_nameSample role.

◆ set_sample_roles() [1/3]

void opennn::Dataset::set_sample_roles ( const string & role_name)

Assigns the same role to every sample.

Parameters
role_nameSample role.

◆ set_sample_roles() [2/3]

void opennn::Dataset::set_sample_roles ( const vector< Index > & sample_indices,
const string & role_name )

Assigns the same role to a list of samples.

Parameters
sample_indicesIndices to update.
role_nameSample role.

◆ set_sample_roles() [3/3]

void opennn::Dataset::set_sample_roles ( const vector< string > & role_names)

Assigns roles to all samples from a parallel string vector.

Parameters
role_namesOne name per sample.

◆ set_separator()

void opennn::Dataset::set_separator ( const Separator & new_separator)
inline

Sets the field separator.

Parameters
new_separatorSeparator enum value.

◆ set_separator_name()

void opennn::Dataset::set_separator_name ( const string & new_separator_name)

Sets the field separator from its human-readable name.

Parameters
new_separator_name"Comma", "Semicolon", "Tab" or "Space".

◆ set_separator_string()

void opennn::Dataset::set_separator_string ( const string & new_separator_string)

Sets the field separator from its delimiter character(s).

Parameters
new_separator_string: Delimiter (",", ";", "\t" or " ").

◆ set_shape()

void opennn::Dataset::set_shape ( const string & role_name,
const Shape & new_shape )

Sets the input or target shape.

Parameters
role_name"Input" or "Target".
new_shapeNew shape.

◆ set_variable_indices()

void opennn::Dataset::set_variable_indices ( const vector< Index > & input_indices,
const vector< Index > & target_indices )

Marks the variables listed in input_indices as Input and those listed in target_indices as Target.

Parameters
input_indicesIndices of input variables.
target_indicesIndices of target variables.

◆ set_variable_names()

void opennn::Dataset::set_variable_names ( const vector< string > & new_variable_names)

Replaces the names of every variable.

Parameters
new_variable_namesOne name per variable.

◆ set_variable_role() [1/2]

void opennn::Dataset::set_variable_role ( const Index variable_index,
const string & role_name )

Sets the role of a single variable by index.

Parameters
variable_indexColumn index.
role_nameNew role.

◆ set_variable_role() [2/2]

void opennn::Dataset::set_variable_role ( const string & variable_name,
const string & role_name )

Sets the role of a single variable by name.

Parameters
variable_nameVariable name.
role_nameNew role.

◆ set_variable_roles() [1/2]

void opennn::Dataset::set_variable_roles ( const string & role_name)

Assigns the same role to every variable.

Parameters
role_nameVariable role.

◆ set_variable_roles() [2/2]

virtual void opennn::Dataset::set_variable_roles ( const vector< string > & role_names)
virtual

Assigns roles to all variables from a parallel string vector.

Parameters
role_namesOne role per variable.

◆ set_variable_scalers() [1/2]

void opennn::Dataset::set_variable_scalers ( const string & scaler_name)

Sets the same scaler on every variable.

Parameters
scaler_nameScaler name.

◆ set_variable_scalers() [2/2]

void opennn::Dataset::set_variable_scalers ( const vector< string > & scaler_names)

Sets one scaler per variable.

Parameters
scaler_namesOne name per variable.

◆ set_variable_type() [1/2]

void opennn::Dataset::set_variable_type ( const Index variable_index,
const VariableType & type )

Sets the type of a single variable by index.

Parameters
variable_indexColumn index.
typeNew variable type.

◆ set_variable_type() [2/2]

void opennn::Dataset::set_variable_type ( const string & variable_name,
const VariableType & type )

Sets the type of a single variable by name.

Parameters
variable_nameVariable name.
typeNew variable type.

◆ set_variable_types()

void opennn::Dataset::set_variable_types ( const VariableType & type)

Sets every variable to a given type.

Parameters
typeVariable type to apply.

◆ set_variables() [1/2]

void opennn::Dataset::set_variables ( const string & description)

Re-creates the variables vector from an input/target shape descriptor.

Parameters
descriptionCompact descriptor parsed by the implementation.

◆ set_variables() [2/2]

void opennn::Dataset::set_variables ( const vector< Variable > & new_variables)
inline

Replaces the per-variable metadata.

Parameters
new_variablesNew variables vector.

◆ set_variables_number()

void opennn::Dataset::set_variables_number ( const Index new_size)
inline

Resizes the variables vector.

Parameters
new_sizeNew variable count.

◆ split_samples() [1/2]

void opennn::Dataset::split_samples ( const float training_ratio = 0.6f,
float selection_ratio = 0.2f,
float testing_ratio = 0.2f,
bool shuffle = true )

Splits samples into training/validation/testing partitions.

Convenience entry point that delegates to split_samples_random() (when shuffle is true) or split_samples_sequential() (otherwise).

Parameters
training_ratio: Fraction of samples for training.
selection_ratio: Fraction of samples for validation.
testing_ratio: Fraction of samples for testing.
shuffle: Whether to shuffle the indices before splitting.

◆ split_samples() [2/2]

vector< vector< Index > > opennn::Dataset::split_samples ( const vector< Index > & indices,
Index parts_number ) const

Splits a list of sample indices into chunks of (roughly) equal size.

Parameters
indicesSample indices to split.
parts_numberNumber of chunks.
Returns
Outer vector indexed by chunk, inner by sample index.

◆ split_samples_random()

void opennn::Dataset::split_samples_random ( const float training_ratio = 0.6f,
float selection_ratio = 0.2f,
float testing_ratio = 0.2f )

Splits samples into partitions after random shuffling.

Parameters
training_ratio: Fraction of samples for training.
selection_ratio: Fraction of samples for validation.
testing_ratio: Fraction of samples for testing.

◆ split_samples_sequential()

void opennn::Dataset::split_samples_sequential ( const float training_ratio = 0.6f,
float selection_ratio = 0.2f,
float testing_ratio = 0.2f )

Splits samples into partitions in their original order.

Parameters
training_ratio: Fraction of samples for training.
selection_ratio: Fraction of samples for validation.
testing_ratio: Fraction of samples for testing.

◆ to_JSON()

virtual void opennn::Dataset::to_JSON ( JsonWriter & writer) const
virtual

Serializes the dataset state to JSON.

Parameters
writer: JSON writer that receives the dataset tree.

Reimplemented in opennn::ImageDataset, opennn::LanguageDataset, and opennn::TimeSeriesDataset.

◆ unscale_features()

void opennn::Dataset::unscale_features ( const string & role_name,
const vector< Descriptives > & feature_descriptives )

Inverse-scales the features of a given role.

Parameters
role_name: Variable role to unscale.
feature_descriptives: Descriptives produced by scale_features().

◆ unuse_collinear_variables()

vector< string > opennn::Dataset::unuse_collinear_variables ( const float maximum_correlation = 0.95f)

Marks variables that are strongly correlated with another input as Unused.

Parameters
maximum_correlation: Threshold; variables whose correlation with another input is above it are marked as Unused.
Returns
Names of the variables marked as unused.

◆ unuse_Tukey_outliers()

void opennn::Dataset::unuse_Tukey_outliers ( const float tukey_factor = 1.5f)

Marks Tukey-outlier samples as unused (sample role "None").

Parameters
tukey_factor: Tukey-fence multiplier.

◆ unuse_uncorrelated_variables()

vector< string > opennn::Dataset::unuse_uncorrelated_variables ( const float minimum_correlation = 0.25f)

Marks variables with low correlation against the target as Unused.

Parameters
minimum_correlation: Threshold; variables whose correlation with the target is below it are marked as Unused.
Returns
Names of the variables marked as unused.

◆ variables_from_JSON()

void opennn::Dataset::variables_from_JSON ( const Json * )
protected

Restores the variables vector from JSON.

◆ variables_to_JSON()

void opennn::Dataset::variables_to_JSON ( JsonWriter & ) const
protected

Serializes the variables vector to JSON.

Member Data Documentation

◆ codification

Codification opennn::Dataset::codification = Codification::UTF8
protected

Source-file character encoding.

◆ data

MatrixR opennn::Dataset::data
protected

Dense data matrix [samples x variables].

◆ data_file_preview

vector<vector<string> > opennn::Dataset::data_file_preview
protected

Cached preview of the first rows of the source file.

◆ data_path

filesystem::path opennn::Dataset::data_path
protected

Path to the source data file.

◆ decoder_shape

Shape opennn::Dataset::decoder_shape
protected

Shape of the decoder portion (transformer-style models).

◆ display

bool opennn::Dataset::display = true
protected

Whether to print progress messages.

◆ gmt

Index opennn::Dataset::gmt = 0
protected

GMT offset for time variables, in hours.

◆ has_header

bool opennn::Dataset::has_header = false
protected

Whether the source file's first row contains column names.

◆ has_sample_ids

bool opennn::Dataset::has_sample_ids = false
protected

Whether the source file's first column contains sample identifiers.

◆ input_shape

Shape opennn::Dataset::input_shape
protected

Shape of the input portion (rank may exceed 1 for image/sequence data).

◆ missing_values_label

string opennn::Dataset::missing_values_label = "NA"
protected

Label that marks missing values in the source file.

◆ missing_values_method

MissingValuesMethod opennn::Dataset::missing_values_method = MissingValuesMethod::Mean
protected

Strategy used to handle missing values.

◆ missing_values_number

Index opennn::Dataset::missing_values_number = 0
protected

Total number of missing cells.

◆ negative_words

const vector<string> opennn::Dataset::negative_words = {"0", "no", "negative", "-", "false", "bad", "not", "No"}
protected

Strings interpreted as negative when parsing binary variables.

◆ positive_words

const vector<string> opennn::Dataset::positive_words = {"1", "yes", "positive", "+", "true", "good", "si", "sí", "Sí"}
protected

Strings interpreted as positive when parsing binary variables.

◆ rows_missing_values_number

Index opennn::Dataset::rows_missing_values_number = 0
protected

Number of rows that contain at least one missing value.

◆ sample_ids

vector<string> opennn::Dataset::sample_ids
protected

Optional per-sample identifiers (when the source file has an id column).

◆ sample_roles

vector<SampleRole> opennn::Dataset::sample_roles
protected

Per-sample role (Training/Validation/Testing/None).

◆ separator

Separator opennn::Dataset::separator = Separator::Comma
protected

Field separator used in the source file.

◆ target_shape

Shape opennn::Dataset::target_shape
protected

Shape of the target portion.

◆ variables

vector<Variable> opennn::Dataset::variables
protected

Per-variable metadata (name, role, type, scaler).

◆ variables_missing_values_number

VectorI opennn::Dataset::variables_missing_values_number
protected

Per-variable missing-value count.