OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
opennn::TabularDataset Class Reference

Tabular dataset with CSV loading, scaling, descriptive statistics and correlation analysis. More...

#include <tabular_dataset.h>

Inheritance diagram for opennn::TabularDataset:
[legend]

Public Types

enum class  MissingValuesMethod { Unuse , Mean , Median , Interpolation }
 Strategy for handling missing values in tabular data. More...
 
- Public Types inherited from opennn::Dataset
enum class  Codification { UTF8 , SHIFT_JIS }
 Text encoding of the source data file. More...
 
enum class  Separator { Space , Tab , Comma , Semicolon }
 Field separator used when reading delimited text files. More...
 

Public Member Functions

 TabularDataset (const Index=0, const Shape &={0}, const Shape &={0})
 Creates a tabular dataset with the given sample count and input/target shapes.
 
 TabularDataset (const filesystem::path &, const string &, bool=true, bool=false, const Codification &=Codification::UTF8)
 Creates a tabular dataset by reading the given file with the given separator.
 
void set (const filesystem::path &, const string &, bool=true, bool=false, const Codification &=Codification::UTF8)
 Resets the dataset from the given file using the given parsing options.
 
void set (const filesystem::path &)
 Resets the dataset by reading from the given path with default parsing options.
 
vector< string > get_feature_scalers (const string &) const
 Returns the scaler names for variables matching the given role.
 
void set_variable_scalers (const string &)
 Sets every variable's scaler from the given name.
 
void set_variable_scalers (const vector< string > &)
 Sets each variable's scaler from the corresponding name in the list.
 
void set_default_variable_scalers ()
 Assigns default scalers based on each variable's type.
 
void set_gmt (const Index new_gmt)
 
DateFormat infer_dataset_date_format (const vector< Variable > &, const vector< vector< string_view > > &, bool, const string &)
 Infers the date/time format used in the given file preview rows.
 
MissingValuesMethod get_missing_values_method () const
 
string get_missing_values_method_string () const
 Returns the missing-values method as its enumerator name.
 
const string & get_missing_values_label () const
 
Index get_missing_values_number () const
 
void set_missing_values_label (string label)
 
void set_missing_values_method (const MissingValuesMethod &method)
 
void set_missing_values_method (const string &)
 Sets the missing-values method from its enumerator name.
 
bool has_missing_values (const vector< string_view > &) const
 Returns true if any field in the row matches the missing-values label.
 
void scrub_missing_values () override
 Removes or imputes missing values using the configured MissingValuesMethod.
 
void calculate_missing_values_statistics ()
 Refreshes per-variable and per-row missing-value counts.
 
void impute_missing_values_statistic (const MissingValuesMethod &)
 Imputes missing values using the given statistic-based method (Mean/Median).
 
virtual void impute_missing_values_unuse ()
 Marks samples containing missing values as unused.
 
virtual void impute_missing_values_interpolate ()
 Fills missing values by interpolating between neighboring samples.
 
vector< string > unuse_uncorrelated_variables (const float=0.25f)
 Marks input variables whose absolute correlation with targets is below the threshold as unused.
 
vector< string > unuse_collinear_variables (const float=0.95f)
 Marks input variables that are highly collinear (above the threshold) as unused.
 
vector< Descriptives > calculate_feature_descriptives () const
 Returns descriptive statistics for every feature across all samples.
 
vector< Descriptives > calculate_feature_descriptives (const string &) const override
 Returns descriptive statistics for features with the given role name.
 
vector< Descriptives > calculate_variable_descriptives_positive_samples () const
 Returns variable descriptives computed only over samples with positive target.
 
vector< Descriptives > calculate_variable_descriptives_negative_samples () const
 Returns variable descriptives computed only over samples with negative target.
 
vector< Descriptives > calculate_variable_descriptives_categories (const Index) const
 Returns variable descriptives restricted to the given target class index.
 
vector< Histogram > calculate_variable_distributions (const Index=10) const
 Returns per-variable histograms with the given bin count.
 
vector< BoxPlot > calculate_variables_box_plots () const
 Returns per-variable Tukey box-plot summaries.
 
Tensor< Correlation, 2 > calculate_input_variable_correlations (Correlation(*)(const MatrixR &, const MatrixR &), Correlation::Method, const string &) const
 Returns the input-input correlation matrix using the given correlation function and method.
 
Tensor< Correlation, 2 > calculate_input_variable_pearson_correlations () const
 Returns the Pearson correlation matrix among input variables.
 
Tensor< Correlation, 2 > calculate_input_variable_spearman_correlations () const
 Returns the Spearman correlation matrix among input variables.
 
Tensor< Correlation, 2 > calculate_input_target_variable_correlations (Correlation(*)(const MatrixR &, const MatrixR &), const string &) const
 Returns the input-target correlation matrix using the given correlation function.
 
Tensor< Correlation, 2 > calculate_input_target_variable_pearson_correlations () const override
 Returns the Pearson correlation matrix between input and target variables.
 
Tensor< Correlation, 2 > calculate_input_target_variable_spearman_correlations () const
 Returns the Spearman correlation matrix between input and target variables.
 
VectorI calculate_correlations_rank () const override
 Returns input variable indices ranked by absolute correlation with the target.
 
vector< Descriptives > scale_data ()
 Scales every feature column using its configured scaler; returns the applied descriptives.
 
vector< Descriptives > scale_features (const string &) override
 Scales the features with the given role using their configured scalers.
 
void unscale_features (const string &, const vector< Descriptives > &) override
 Reverts the scaling of features with the given role using the supplied descriptives.
 
VectorI calculate_target_distribution () const override
 Returns the distribution of target classes for classification datasets.
 
vector< vector< Index > > calculate_Tukey_outliers (const float=1.5f, bool=false)
 Detects Tukey outliers per variable using the given fence multiplier.
 
vector< vector< Index > > replace_Tukey_outliers_with_NaN (const float=1.5f)
 Replaces detected Tukey outliers with NaN and returns the affected indices.
 
void unuse_Tukey_outliers (const float=1.5f)
 Marks samples containing Tukey outliers as unused.
 
void set_data_random () override
 Fills the data matrix with random values.
 
void set_data_integer (const Index vocabulary_size) override
 Fills the data matrix with random integers up to the given vocabulary size.
 
void set_data_rosenbrock ()
 Fills the data matrix with samples drawn from the Rosenbrock function (regression test data).
 
void set_data_binary_classification ()
 Fills the data matrix with a synthetic binary classification dataset.
 
void from_JSON (const JsonDocument &) override
 Loads dataset state from a JSON document.
 
void to_JSON (JsonWriter &) const override
 Writes dataset state to a JSON writer.
 
void read_csv ()
 Reads the configured CSV file into the dataset, inferring types and headers.
 
void set (const Index=0, const Shape &={}, const Shape &={})
 Resets the dataset with the given sample count and input/target shapes.
 
- Public Member Functions inherited from opennn::Dataset
virtual ~Dataset ()=default
 
virtual Index get_samples_number () const
 Returns the total number of samples (rows) in the data matrix.
 
Index get_samples_number (const string &) const
 Returns the number of samples with the given role ("Training", "Validation", ...).
 
Index get_used_samples_number () const
 Returns the number of samples whose role is not None.
 
vector< Index > get_sample_indices (const string &) const
 Returns indices of samples with the given role name.
 
vector< Index > get_used_sample_indices () const
 Returns indices of all samples whose role is not None.
 
const vector< SampleRole > & get_sample_roles () const
 Returns the per-sample role vector.
 
vector< Index > get_sample_roles_vector () const
 Returns the per-sample roles as integer indices.
 
VectorI get_sample_role_numbers () const
 Returns the per-sample roles as a tensor of integer indices.
 
Index get_variables_number () const
 Returns the total number of variables (column descriptors).
 
Index get_variables_number (const string &) const
 Returns the number of variables with the given role name.
 
Index get_used_variables_number () const
 Returns the number of variables whose role is in use.
 
const vector< Variable > & get_variables () const
 Returns the variable descriptors.
 
vector< Variable > get_variables (const string &) const
 Returns the variables with the given role name.
 
Index get_variable_index (const string &) const
 Returns the index of the variable with the given name.
 
Index get_variable_index (const Index) const
 Returns the variable index corresponding to a flat feature index.
 
vector< Index > get_variable_indices (const string &) const
 Returns the indices of variables matching the given role name.
 
vector< Index > get_used_variables_indices () const
 Returns the indices of all in-use variables.
 
vector< string > get_variable_names () const
 Returns the names of all variables.
 
vector< string > get_variable_names (const string &) const
 Returns the names of variables with the given role name.
 
VariableType get_variable_type (const Index index) const
 Returns the VariableType of the variable at the given index.
 
vector< VariableType > get_variable_types (const vector< Index > &indices) const
 Returns the VariableType for each of the given variable indices.
 
Index get_features_number () const
 Returns the total number of features (data matrix columns).
 
Index get_features_number (const string &) const
 Returns the number of features for variables with the given role name.
 
Index get_used_features_number () const
 Returns the number of features for in-use variables.
 
vector< string > get_feature_names () const
 Returns the expanded feature names (one entry per data matrix column).
 
vector< string > get_feature_names (const string &) const
 Returns the expanded feature names restricted to the given role.
 
vector< vector< Index > > get_feature_indices () const
 Returns the data column indices grouped per variable.
 
vector< Index > get_feature_indices (const Index) const
 Returns the data column indices that belong to the given variable index.
 
vector< Index > get_feature_indices (const string &) const
 Returns the data column indices that belong to variables with the given role name.
 
vector< Index > get_used_feature_indices () const
 Returns the data column indices that belong to in-use variables.
 
vector< Index > get_feature_dimensions () const
 Returns the per-variable feature dimension counts.
 
Shape get_shape (const string &) const
 Returns the configured Shape for the given role ("Input", "Target", "Decoder").
 
virtual void get_batches (const vector< Index > &, Index, bool, vector< vector< Index > > &) const
 Splits sample indices into batches and writes them into the output vector argument.
 
const vector< vector< string > > & get_data_file_preview () const
 Returns the parsed preview rows captured during the last file read.
 
const filesystem::path & get_data_path () const
 Returns the configured data file path.
 
const Separator & get_separator () const
 Returns the current field Separator.
 
string get_separator_string () const
 Returns the separator as the literal character used in files.
 
string get_separator_name () const
 Returns the separator as its enumerator name ("Space", "Tab", ...).
 
const Codification & get_codification () const
 Returns the configured text Codification.
 
string get_codification_string () const
 Returns the codification as its enumerator name.
 
bool get_display () const
 Returns whether progress messages are printed.
 
virtual bool is_empty () const
 Returns true when the dataset contains no samples.
 
Shape get_input_shape () const
 Returns the configured input tensor shape.
 
Shape get_target_shape () const
 Returns the configured target tensor shape.
 
const MatrixR & get_data () const
 Returns the raw data matrix (rows = samples, columns = features).
 
void set_data (const MatrixR &)
 Replaces the underlying data matrix.
 
void set_data_constant (const float)
 Fills the data matrix with the given constant value.
 
void set_default ()
 Restores default dataset settings.
 
void set_sample_roles (const string &)
 Assigns the same role to every sample.
 
void set_sample_role (const Index, const string &)
 Sets the role of a single sample by index.
 
void set_sample_roles (const vector< string > &)
 Assigns a role per sample from a string vector.
 
void set_sample_roles (const vector< Index > &, const string &)
 Assigns the same role to the given sample indices.
 
void set_variables (const vector< Variable > &new_variables)
 
void set_default_variable_names ()
 Assigns default placeholder names to all variables.
 
void set_variable_roles (const vector< string > &)
 Sets the role of every variable from the given string list.
 
void set_variable_indices (const vector< Index > &, const vector< Index > &)
 Sets which variable indices are inputs and which are targets.
 
void set_input_variables_unused ()
 Marks all input variables as unused (role None).
 
void set_variable_role (const Index, const string &)
 Sets the role of the variable at the given index.
 
void set_variable_role (const string &, const string &)
 Sets the role of the variable with the given name.
 
void set_variable_type (const Index, const VariableType &)
 Sets the type of the variable at the given index.
 
void set_variable_type (const string &, const VariableType &)
 Sets the type of the variable with the given name.
 
void set_variable_types (const VariableType &)
 Sets every variable to the given type.
 
void set_binary_variables ()
 Detects and marks variables with binary values as VariableType::Binary.
 
void set_variable_names (const vector< string > &)
 Assigns names to all variables from the given list.
 
void set_variables_number (const Index new_size)
 
void set_feature_names (const vector< string > &)
 Assigns expanded feature names, propagating categories back to variables.
 
void set_variable_roles (const string &)
 Assigns the same role to every variable.
 
void set_shape (const string &, const Shape &)
 Sets the tensor Shape associated with the given role ("Input"/"Target"/"Decoder").
 
virtual void resize_input_shape (Index input_features_count)
 Resizes the input shape to the given flat feature count.
 
void set_data_path (const filesystem::path &new_data_path)
 
void set_has_header (bool new_has_header)
 
void set_has_ids (bool new_has_ids)
 
void set_separator (const Separator &new_separator)
 
void set_separator_string (const string &)
 Sets the separator from its literal character.
 
void set_separator_name (const string &)
 Sets the separator from its enumerator name.
 
void set_codification (const Codification &new_codification)
 
void set_codification (const string &)
 Sets the codification from its enumerator name.
 
void set_display (bool new_display)
 
bool is_sample_used (const Index i) const
 Returns true if the sample at i has a role other than None.
 
bool has_binary_variables () const
 Returns true if any variable has type Binary.
 
bool has_categorical_variables () const
 Returns true if any variable has type Categorical.
 
bool has_binary_or_categorical_variables () const
 Returns true if any variable is Binary or Categorical.
 
bool has_time_variable () const
 Returns true if any variable has role Time.
 
bool has_validation () const
 Returns true if any sample is assigned the Validation role.
 
void split_samples (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true)
 Splits samples into Training/Validation/Testing roles, optionally shuffled.
 
void split_samples_sequential (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
 Splits samples sequentially without shuffling.
 
void split_samples_random (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
 Splits samples randomly across roles.
 
vector< vector< Index > > split_samples (const vector< Index > &, Index) const
 Splits the given indices into chunks of the requested size.
 
MatrixR get_data (const string &, const string &) const
 Returns the data submatrix for the given sample role and feature role.
 
MatrixR get_data_from_indices (const vector< Index > &, const vector< Index > &) const
 Returns a submatrix from the data using explicit row and column indices.
 
VectorR get_sample_data (const Index) const
 Returns the row of the data matrix at the given sample index.
 
MatrixR get_variable_data (const Index) const
 Returns the data columns belonging to the variable at index.
 
MatrixR get_variable_data (const Index, const vector< Index > &) const
 Returns the variable data restricted to the given sample indices.
 
MatrixR get_variable_data (const string &) const
 Returns the data columns of the variable with the given name.
 
MatrixR get_feature_data (const string &) const
 Returns the data columns belonging to features with the given role name.
 
void set (const Index=0, const Shape &={}, const Shape &={})
 Resets the dataset with the given sample count and input/target shapes.
 
bool has_nan () const
 Returns true if any entry in the data matrix is NaN.
 
bool has_nan_row (const Index) const
 Returns true if the sample at the given row contains a NaN.
 
VectorI count_nans_per_variable () const
 Returns the NaN count for each variable.
 
Index count_variables_with_nan () const
 Returns the number of variables that contain at least one NaN.
 
Index count_rows_with_nan () const
 Returns the number of rows that contain at least one NaN.
 
Index count_nan () const
 Returns the total NaN count in the data matrix.
 
void save (const filesystem::path &) const
 Saves the dataset metadata to a JSON file at the given path.
 
void load (const filesystem::path &)
 Loads the dataset metadata from the JSON file at the given path.
 
void save_data () const
 Saves the data matrix to the configured data path.
 
void save_data_binary (const filesystem::path &) const
 Saves the data matrix to the given path in binary form.
 
void load_data_binary ()
 Loads the data matrix from the binary file at the configured data path.
 
virtual void fill_inputs (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
 Copies input features of the given samples into a destination buffer.
 
virtual void augment_inputs (float *, Index) const
 Applies data augmentation in place to the input buffer (no-op in base class).
 
virtual void fill_decoder (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
 Copies decoder input features into a destination buffer (sequence models).
 
virtual void fill_targets (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
 Copies target features of the given samples into a destination buffer.
 

Protected Member Functions

void missing_values_to_JSON (JsonWriter &) const
 
void missing_values_from_JSON (const Json *)
 
void infer_column_types (const vector< vector< string_view > > &)
 
vector< Index > filter_used_samples_by_column (Index, bool) const
 
void apply_scaler (Index, const string &, const Descriptives &, bool)
 
- Protected Member Functions inherited from opennn::Dataset
 Dataset ()=default
 
void set_default_variable_roles ()
 
void set_default_variable_roles_forecasting ()
 
void infer_variable_types_from_data ()
 
void read_data_file_preview (const vector< vector< string_view > > &)
 
void check_separators (string_view) const
 
void samples_from_JSON (const Json *)
 
void variables_to_JSON (JsonWriter &) const
 
void samples_to_JSON (JsonWriter &) const
 
void preview_data_to_JSON (JsonWriter &) const
 
void variables_from_JSON (const Json *)
 
void preview_data_from_JSON (const Json *)
 

Protected Attributes

string missing_values_label = "NA"
 
MissingValuesMethod missing_values_method = MissingValuesMethod::Mean
 
Index missing_values_number = 0
 
VectorI variables_missing_values_number
 
Index rows_missing_values_number = 0
 
Index gmt = 0
 
- Protected Attributes inherited from opennn::Dataset
Shape input_shape
 
Shape target_shape
 
Shape decoder_shape
 
vector< SampleRole > sample_roles
 
vector< string > sample_ids
 
vector< Variable > variables
 
MatrixR data
 
filesystem::path data_path
 
Separator separator = Separator::Comma
 
bool has_header = false
 
bool has_sample_ids = false
 
Codification codification = Codification::UTF8
 
vector< vector< string > > data_file_preview
 
bool display = true
 
const vector< string > positive_words = {"1", "yes", "positive", "+", "true", "good", "si", "sí", "Sí"}
 
const vector< string > negative_words = {"0", "no", "negative", "-", "false", "bad", "not", "No"}
 

Detailed Description

Tabular dataset with CSV loading, scaling, descriptive statistics and correlation analysis.

Member Enumeration Documentation

◆ MissingValuesMethod

Strategy for handling missing values in tabular data.

Enumerator
Unuse 
Mean 
Median 
Interpolation 

Constructor & Destructor Documentation

◆ TabularDataset() [1/2]

opennn::TabularDataset::TabularDataset ( const Index = 0,
const Shape & = {0},
const Shape & = {0} )

Creates a tabular dataset with the given sample count and input/target shapes.

Parameters
sample_count: Number of samples to allocate.
input_shape: Shape of input features.
target_shape: Shape of target features.

◆ TabularDataset() [2/2]

opennn::TabularDataset::TabularDataset ( const filesystem::path & ,
const string & ,
bool = true,
bool = false,
const Codification & = Codification::UTF8 )

Creates a tabular dataset by reading the given file with the given separator.

Parameters
data_path: Path to the source CSV/text file.
separator: Field separator name ("Space", "Tab", "Comma", "Semicolon").
has_header: Whether the first row contains column names.
has_ids: Whether the first column contains sample identifiers.
codification: Text encoding of the file.

Member Function Documentation

◆ apply_scaler()

void opennn::TabularDataset::apply_scaler ( Index ,
const string & ,
const Descriptives & ,
bool  )
protected

◆ calculate_correlations_rank()

VectorI opennn::TabularDataset::calculate_correlations_rank ( ) const
override virtual

Returns input variable indices ranked by absolute correlation with the target.

Reimplemented from opennn::Dataset.

◆ calculate_feature_descriptives() [1/2]

vector< Descriptives > opennn::TabularDataset::calculate_feature_descriptives ( ) const

Returns descriptive statistics for every feature across all samples.

◆ calculate_feature_descriptives() [2/2]

vector< Descriptives > opennn::TabularDataset::calculate_feature_descriptives ( const string & ) const
override virtual

Returns descriptive statistics for features with the given role name.

Reimplemented from opennn::Dataset.

◆ calculate_input_target_variable_correlations()

Tensor< Correlation, 2 > opennn::TabularDataset::calculate_input_target_variable_correlations ( Correlation(*  )(const MatrixR &, const MatrixR &),
const string &  ) const

Returns the input-target correlation matrix using the given correlation function.

◆ calculate_input_target_variable_pearson_correlations()

Tensor< Correlation, 2 > opennn::TabularDataset::calculate_input_target_variable_pearson_correlations ( ) const
override virtual

Returns the Pearson correlation matrix between input and target variables.

Reimplemented from opennn::Dataset.

◆ calculate_input_target_variable_spearman_correlations()

Tensor< Correlation, 2 > opennn::TabularDataset::calculate_input_target_variable_spearman_correlations ( ) const

Returns the Spearman correlation matrix between input and target variables.

◆ calculate_input_variable_correlations()

Tensor< Correlation, 2 > opennn::TabularDataset::calculate_input_variable_correlations ( Correlation(*  )(const MatrixR &, const MatrixR &),
Correlation::Method ,
const string &  ) const

Returns the input-input correlation matrix using the given correlation function and method.

◆ calculate_input_variable_pearson_correlations()

Tensor< Correlation, 2 > opennn::TabularDataset::calculate_input_variable_pearson_correlations ( ) const

Returns the Pearson correlation matrix among input variables.

◆ calculate_input_variable_spearman_correlations()

Tensor< Correlation, 2 > opennn::TabularDataset::calculate_input_variable_spearman_correlations ( ) const

Returns the Spearman correlation matrix among input variables.

◆ calculate_missing_values_statistics()

void opennn::TabularDataset::calculate_missing_values_statistics ( )

Refreshes per-variable and per-row missing-value counts.

◆ calculate_target_distribution()

VectorI opennn::TabularDataset::calculate_target_distribution ( ) const
override virtual

Returns the distribution of target classes for classification datasets.

Reimplemented from opennn::Dataset.

◆ calculate_Tukey_outliers()

vector< vector< Index > > opennn::TabularDataset::calculate_Tukey_outliers ( const float = 1.5f,
bool = false )

Detects Tukey outliers per variable using the given fence multiplier.

Parameters
cleaning_parameter: Multiplier applied to the interquartile range.
use_only_used_samples: If true, restrict the analysis to in-use samples.
Returns
Outlier row indices per variable.

◆ calculate_variable_descriptives_categories()

vector< Descriptives > opennn::TabularDataset::calculate_variable_descriptives_categories ( const Index ) const

Returns variable descriptives restricted to the given target class index.

◆ calculate_variable_descriptives_negative_samples()

vector< Descriptives > opennn::TabularDataset::calculate_variable_descriptives_negative_samples ( ) const

Returns variable descriptives computed only over samples with negative target.

◆ calculate_variable_descriptives_positive_samples()

vector< Descriptives > opennn::TabularDataset::calculate_variable_descriptives_positive_samples ( ) const

Returns variable descriptives computed only over samples with positive target.

◆ calculate_variable_distributions()

vector< Histogram > opennn::TabularDataset::calculate_variable_distributions ( const Index = 10) const

Returns per-variable histograms with the given bin count.

◆ calculate_variables_box_plots()

vector< BoxPlot > opennn::TabularDataset::calculate_variables_box_plots ( ) const

Returns per-variable Tukey box-plot summaries.

◆ filter_used_samples_by_column()

vector< Index > opennn::TabularDataset::filter_used_samples_by_column ( Index ,
bool  ) const
protected

◆ from_JSON()

void opennn::TabularDataset::from_JSON ( const JsonDocument & )
override virtual

Loads dataset state from a JSON document.

Implements opennn::Dataset.

Reimplemented in opennn::TimeSeriesDataset.

◆ get_feature_scalers()

vector< string > opennn::TabularDataset::get_feature_scalers ( const string & ) const

Returns the scaler names for variables matching the given role.

◆ get_missing_values_label()

const string & opennn::TabularDataset::get_missing_values_label ( ) const
inline

◆ get_missing_values_method()

MissingValuesMethod opennn::TabularDataset::get_missing_values_method ( ) const
inline

◆ get_missing_values_method_string()

string opennn::TabularDataset::get_missing_values_method_string ( ) const

Returns the missing-values method as its enumerator name.

◆ get_missing_values_number()

Index opennn::TabularDataset::get_missing_values_number ( ) const
inline

◆ has_missing_values()

bool opennn::TabularDataset::has_missing_values ( const vector< string_view > & ) const

Returns true if any field in the row matches the missing-values label.

◆ impute_missing_values_interpolate()

virtual void opennn::TabularDataset::impute_missing_values_interpolate ( )
virtual

Fills missing values by interpolating between neighboring samples.

Reimplemented in opennn::TimeSeriesDataset.

◆ impute_missing_values_statistic()

void opennn::TabularDataset::impute_missing_values_statistic ( const MissingValuesMethod & )

Imputes missing values using the given statistic-based method (Mean/Median).

◆ impute_missing_values_unuse()

virtual void opennn::TabularDataset::impute_missing_values_unuse ( )
virtual

Marks samples containing missing values as unused.

Reimplemented in opennn::TimeSeriesDataset.

◆ infer_column_types()

void opennn::TabularDataset::infer_column_types ( const vector< vector< string_view > > & )
protected

◆ infer_dataset_date_format()

DateFormat opennn::TabularDataset::infer_dataset_date_format ( const vector< Variable > & ,
const vector< vector< string_view > > & ,
bool ,
const string &  )

Infers the date/time format used in the given file preview rows.

Parameters
variables: Variable descriptors used to locate date columns.
sample_rows: Preview rows of the source file.
has_header: Whether the first row is a header.
separator: Field separator string.
Returns
Detected date format.

◆ missing_values_from_JSON()

void opennn::TabularDataset::missing_values_from_JSON ( const Json * )
protected

◆ missing_values_to_JSON()

void opennn::TabularDataset::missing_values_to_JSON ( JsonWriter & ) const
protected

◆ read_csv()

void opennn::TabularDataset::read_csv ( )

Reads the configured CSV file into the dataset, inferring types and headers.

◆ replace_Tukey_outliers_with_NaN()

vector< vector< Index > > opennn::TabularDataset::replace_Tukey_outliers_with_NaN ( const float = 1.5f)

Replaces detected Tukey outliers with NaN and returns the affected indices.

◆ scale_data()

vector< Descriptives > opennn::TabularDataset::scale_data ( )

Scales every feature column using its configured scaler; returns the applied descriptives.

◆ scale_features()

vector< Descriptives > opennn::TabularDataset::scale_features ( const string & )
override virtual

Scales the features with the given role using their configured scalers.

Reimplemented from opennn::Dataset.

◆ scrub_missing_values()

void opennn::TabularDataset::scrub_missing_values ( )
override virtual

Removes or imputes missing values using the configured MissingValuesMethod.

Reimplemented from opennn::Dataset.

◆ set() [1/3]

void opennn::TabularDataset::set ( const filesystem::path & )

Resets the dataset by reading from the given path with default parsing options.

◆ set() [2/3]

void opennn::TabularDataset::set ( const filesystem::path & ,
const string & ,
bool = true,
bool = false,
const Codification & = Codification::UTF8 )

Resets the dataset from the given file using the given parsing options.

Parameters
data_path: Path to the source CSV/text file.
separator: Field separator name.
has_header: Whether the first row contains column names.
has_ids: Whether the first column contains sample identifiers.
codification: Text encoding of the file.

◆ set() [3/3]

void opennn::Dataset::set ( const Index = 0,
const Shape & = {},
const Shape & = {} )

Resets the dataset with the given sample count and input/target shapes.

Parameters
sample_count: Number of samples to allocate.
input_shape: Shape of input features.
target_shape: Shape of target features.

◆ set_data_binary_classification()

void opennn::TabularDataset::set_data_binary_classification ( )

Fills the data matrix with a synthetic binary classification dataset.

◆ set_data_integer()

void opennn::TabularDataset::set_data_integer ( const Index vocabulary_size)
override virtual

Fills the data matrix with random integers up to the given vocabulary size.

Reimplemented from opennn::Dataset.

◆ set_data_random()

void opennn::TabularDataset::set_data_random ( )
override virtual

Fills the data matrix with random values.

Reimplemented from opennn::Dataset.

◆ set_data_rosenbrock()

void opennn::TabularDataset::set_data_rosenbrock ( )

Fills the data matrix with samples drawn from the Rosenbrock function (regression test data).

◆ set_default_variable_scalers()

void opennn::TabularDataset::set_default_variable_scalers ( )

Assigns default scalers based on each variable's type.

◆ set_gmt()

void opennn::TabularDataset::set_gmt ( const Index new_gmt)
inline

◆ set_missing_values_label()

void opennn::TabularDataset::set_missing_values_label ( string label)
inline

◆ set_missing_values_method() [1/2]

void opennn::TabularDataset::set_missing_values_method ( const MissingValuesMethod & method)
inline

◆ set_missing_values_method() [2/2]

void opennn::TabularDataset::set_missing_values_method ( const string & )

Sets the missing-values method from its enumerator name.

◆ set_variable_scalers() [1/2]

void opennn::TabularDataset::set_variable_scalers ( const string & )

Sets every variable's scaler from the given name.

◆ set_variable_scalers() [2/2]

void opennn::TabularDataset::set_variable_scalers ( const vector< string > & )

Sets each variable's scaler from the corresponding name in the list.

◆ to_JSON()

void opennn::TabularDataset::to_JSON ( JsonWriter & ) const
override virtual

Writes dataset state to a JSON writer.

Reimplemented from opennn::Dataset.

Reimplemented in opennn::TimeSeriesDataset.

◆ unscale_features()

void opennn::TabularDataset::unscale_features ( const string & ,
const vector< Descriptives > &  )
override virtual

Reverts the scaling of features with the given role using the supplied descriptives.

Reimplemented from opennn::Dataset.

◆ unuse_collinear_variables()

vector< string > opennn::TabularDataset::unuse_collinear_variables ( const float = 0.95f)

Marks input variables that are highly collinear (above the threshold) as unused.

Returns
Names of the variables that were marked as unused.

◆ unuse_Tukey_outliers()

void opennn::TabularDataset::unuse_Tukey_outliers ( const float = 1.5f)

Marks samples containing Tukey outliers as unused.

◆ unuse_uncorrelated_variables()

vector< string > opennn::TabularDataset::unuse_uncorrelated_variables ( const float = 0.25f)

Marks input variables whose absolute correlation with targets is below the threshold as unused.

Returns
Names of the variables that were marked as unused.

Member Data Documentation

◆ gmt

Index opennn::TabularDataset::gmt = 0
protected

◆ missing_values_label

string opennn::TabularDataset::missing_values_label = "NA"
protected

◆ missing_values_method

MissingValuesMethod opennn::TabularDataset::missing_values_method = MissingValuesMethod::Mean
protected

◆ missing_values_number

Index opennn::TabularDataset::missing_values_number = 0
protected

◆ rows_missing_values_number

Index opennn::TabularDataset::rows_missing_values_number = 0
protected

◆ variables_missing_values_number

VectorI opennn::TabularDataset::variables_missing_values_number
protected