OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
opennn::Dataset Class Reference (abstract)

Abstract base class for OpenNN datasets, owning samples, variables, and metadata. More...

#include <dataset.h>

Inheritance diagram for opennn::Dataset:
[legend]

Public Types

enum class  Codification { UTF8 , SHIFT_JIS }
 Text encoding of the source data file. More...
 
enum class  Separator { Space , Tab , Comma , Semicolon }
 Field separator used when reading delimited text files. More...
 

Public Member Functions

virtual ~Dataset ()=default
 
virtual Index get_samples_number () const
 Returns the total number of samples (rows) in the data matrix.
 
Index get_samples_number (const string &) const
 Returns the number of samples with the given role ("Training", "Validation", ...).
 
Index get_used_samples_number () const
 Returns the number of samples whose role is not None.
 
vector< Index > get_sample_indices (const string &) const
 Returns indices of samples with the given role name.
 
vector< Index > get_used_sample_indices () const
 Returns indices of all samples whose role is not None.
 
const vector< SampleRole > & get_sample_roles () const
 Returns the per-sample role vector.
 
vector< Index > get_sample_roles_vector () const
 Returns the per-sample roles as integer indices.
 
VectorI get_sample_role_numbers () const
 Returns the per-sample roles as a tensor of integer indices.
 
Index get_variables_number () const
 Returns the total number of variables (column descriptors).
 
Index get_variables_number (const string &) const
 Returns the number of variables with the given role name.
 
Index get_used_variables_number () const
 Returns the number of variables whose role is in use.
 
const vector< Variable > & get_variables () const
 Returns the variable descriptors.
 
vector< Variable > get_variables (const string &) const
 Returns the variables with the given role name.
 
Index get_variable_index (const string &) const
 Returns the index of the variable with the given name.
 
Index get_variable_index (const Index) const
 Returns the variable index corresponding to a flat feature index.
 
vector< Index > get_variable_indices (const string &) const
 Returns the indices of variables matching the given role name.
 
vector< Index > get_used_variables_indices () const
 Returns the indices of all in-use variables.
 
vector< string > get_variable_names () const
 Returns the names of all variables.
 
vector< string > get_variable_names (const string &) const
 Returns the names of variables with the given role name.
 
VariableType get_variable_type (const Index index) const
 Returns the VariableType of the variable at the given index.
 
vector< VariableType > get_variable_types (const vector< Index > &indices) const
 Returns the VariableType for each of the given variable indices.
 
Index get_features_number () const
 Returns the total number of features (data matrix columns).
 
Index get_features_number (const string &) const
 Returns the number of features for variables with the given role name.
 
Index get_used_features_number () const
 Returns the number of features for in-use variables.
 
vector< string > get_feature_names () const
 Returns the expanded feature names (one entry per data matrix column).
 
vector< string > get_feature_names (const string &) const
 Returns the expanded feature names restricted to the given role.
 
vector< vector< Index > > get_feature_indices () const
 Returns the data column indices grouped per variable.
 
vector< Index > get_feature_indices (const Index) const
 Returns the data column indices that belong to the given variable index.
 
vector< Index > get_feature_indices (const string &) const
 Returns the data column indices that belong to variables with the given role name.
 
vector< Index > get_used_feature_indices () const
 Returns the data column indices that belong to in-use variables.
 
vector< Index > get_feature_dimensions () const
 Returns the per-variable feature dimension counts.
 
Shape get_shape (const string &) const
 Returns the configured Shape for the given role ("Input", "Target", "Decoder").
 
virtual void get_batches (const vector< Index > &, Index, bool, vector< vector< Index > > &) const
 Splits sample indices into batches and writes them into batches.
 
const vector< vector< string > > & get_data_file_preview () const
 Returns the parsed preview rows captured during the last file read.
 
const filesystem::path & get_data_path () const
 Returns the configured data file path.
 
const Separator & get_separator () const
 Returns the current field Separator.
 
string get_separator_string () const
 Returns the separator as the literal character used in files.
 
string get_separator_name () const
 Returns the separator as its enumerator name ("Space", "Tab", ...).
 
const Codification & get_codification () const
 Returns the configured text Codification.
 
string get_codification_string () const
 Returns the codification as its enumerator name.
 
bool get_display () const
 Returns whether progress messages are printed.
 
virtual bool is_empty () const
 Returns true when the dataset contains no samples.
 
Shape get_input_shape () const
 Returns the configured input tensor shape.
 
Shape get_target_shape () const
 Returns the configured target tensor shape.
 
const MatrixR & get_data () const
 Returns the raw data matrix (rows = samples, columns = features).
 
void set_data (const MatrixR &)
 Replaces the underlying data matrix.
 
void set_data_constant (const float)
 Fills the data matrix with the given constant value.
 
void set_default ()
 Restores default dataset settings.
 
void set_sample_roles (const string &)
 Assigns the same role to every sample.
 
void set_sample_role (const Index, const string &)
 Sets the role of a single sample by index.
 
void set_sample_roles (const vector< string > &)
 Assigns a role per sample from a string vector.
 
void set_sample_roles (const vector< Index > &, const string &)
 Assigns the same role to the given sample indices.
 
void set_variables (const vector< Variable > &new_variables)
 
void set_default_variable_names ()
 Assigns default placeholder names to all variables.
 
void set_variable_roles (const vector< string > &)
 Sets the role of every variable from the given string list.
 
void set_variable_indices (const vector< Index > &, const vector< Index > &)
 Sets which variable indices are inputs and which are targets.
 
void set_input_variables_unused ()
 Marks all input variables as unused (role None).
 
void set_variable_role (const Index, const string &)
 Sets the role of the variable at the given index.
 
void set_variable_role (const string &, const string &)
 Sets the role of the variable with the given name.
 
void set_variable_type (const Index, const VariableType &)
 Sets the type of the variable at the given index.
 
void set_variable_type (const string &, const VariableType &)
 Sets the type of the variable with the given name.
 
void set_variable_types (const VariableType &)
 Sets every variable to the given type.
 
void set_binary_variables ()
 Detects and marks variables with binary values as VariableType::Binary.
 
void set_variable_names (const vector< string > &)
 Assigns names to all variables from the given list.
 
void set_variables_number (const Index new_size)
 
void set_feature_names (const vector< string > &)
 Assigns expanded feature names, propagating categories back to variables.
 
void set_variable_roles (const string &)
 Assigns the same role to every variable.
 
void set_shape (const string &, const Shape &)
 Sets the tensor Shape associated with the given role ("Input"/"Target"/"Decoder").
 
virtual void resize_input_shape (Index input_features_count)
 Resizes the input shape to the given flat feature count.
 
void set_data_path (const filesystem::path &new_data_path)
 
void set_has_header (bool new_has_header)
 
void set_has_ids (bool new_has_ids)
 
void set_separator (const Separator &new_separator)
 
void set_separator_string (const string &)
 Sets the separator from its literal character.
 
void set_separator_name (const string &)
 Sets the separator from its enumerator name.
 
void set_codification (const Codification &new_codification)
 
void set_codification (const string &)
 Sets the codification from its enumerator name.
 
void set_display (bool new_display)
 
bool is_sample_used (const Index i) const
 Returns true if the sample at i has a role other than None.
 
bool has_binary_variables () const
 Returns true if any variable has type Binary.
 
bool has_categorical_variables () const
 Returns true if any variable has type Categorical.
 
bool has_binary_or_categorical_variables () const
 Returns true if any variable is Binary or Categorical.
 
bool has_time_variable () const
 Returns true if any variable has role Time.
 
bool has_validation () const
 Returns true if any sample is assigned the Validation role.
 
void split_samples (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true)
 Splits samples into Training/Validation/Testing roles, optionally shuffled.
 
void split_samples_sequential (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
 Splits samples sequentially without shuffling.
 
void split_samples_random (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
 Splits samples randomly across roles.
 
vector< vector< Index > > split_samples (const vector< Index > &, Index) const
 Splits the given indices into chunks of the requested size.
 
virtual vector< Descriptives > scale_features (const string &)
 Scales the features with the configured method; returns the applied descriptives.
 
virtual void set_data_random ()
 Fills the data matrix with random values (no-op in base class).
 
virtual void set_data_integer (const Index)
 Fills the data matrix with random integers up to the given vocabulary size.
 
virtual void from_JSON (const JsonDocument &)=0
 Loads dataset state from a JSON document.
 
virtual void to_JSON (JsonWriter &) const
 Writes dataset state to a JSON writer.
 
MatrixR get_data (const string &, const string &) const
 Returns the data submatrix for the given sample role and feature role.
 
MatrixR get_data_from_indices (const vector< Index > &, const vector< Index > &) const
 Returns a submatrix from the data using explicit row and column indices.
 
VectorR get_sample_data (const Index) const
 Returns the row of the data matrix at the given sample index.
 
MatrixR get_variable_data (const Index) const
 Returns the data columns belonging to the variable at index.
 
MatrixR get_variable_data (const Index, const vector< Index > &) const
 Returns the variable data restricted to the given sample indices.
 
MatrixR get_variable_data (const string &) const
 Returns the data columns of the variable with the given name.
 
MatrixR get_feature_data (const string &) const
 Returns the data columns belonging to features with the given role name.
 
void set (const Index=0, const Shape &={}, const Shape &={})
 Resets the dataset with the given sample count and input/target shapes.
 
bool has_nan () const
 Returns true if any entry in the data matrix is NaN.
 
bool has_nan_row (const Index) const
 Returns true if the sample at the given row contains a NaN.
 
VectorI count_nans_per_variable () const
 Returns the NaN count for each variable.
 
Index count_variables_with_nan () const
 Returns the number of variables that contain at least one NaN.
 
Index count_rows_with_nan () const
 Returns the number of rows that contain at least one NaN.
 
Index count_nan () const
 Returns the total NaN count in the data matrix.
 
virtual void scrub_missing_values ()
 Removes or imputes missing values using the configured strategy.
 
virtual vector< Descriptives > calculate_feature_descriptives (const string &) const
 Returns descriptive statistics for features with the given role (subclass-specific).
 
virtual Tensor< Correlation, 2 > calculate_input_target_variable_pearson_correlations () const
 Returns the Pearson correlations between input and target variables.
 
virtual VectorI calculate_target_distribution () const
 Returns the distribution of target values.
 
virtual VectorI calculate_correlations_rank () const
 Returns input variables ranked by absolute correlation with the target.
 
virtual void unscale_features (const string &, const vector< Descriptives > &)
 Reverts a previously applied scaling using the supplied descriptives.
 
void save (const filesystem::path &) const
 Saves the dataset metadata to a JSON file at the given path.
 
void load (const filesystem::path &)
 Loads the dataset metadata from the JSON file at the given path.
 
void save_data () const
 Saves the data matrix to the configured data path.
 
void save_data_binary (const filesystem::path &) const
 Saves the data matrix to the given path in binary form.
 
void load_data_binary ()
 Loads the data matrix from the binary file at the configured data path.
 
virtual void fill_inputs (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
 Copies input features of the given samples into a destination buffer.
 
virtual void augment_inputs (float *, Index) const
 Applies data augmentation in place to the input buffer (no-op in base class).
 
virtual void fill_decoder (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
 Copies decoder input features into a destination buffer (sequence models).
 
virtual void fill_targets (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
 Copies target features of the given samples into a destination buffer.
 

Protected Member Functions

 Dataset ()=default
 
void set_default_variable_roles ()
 
void set_default_variable_roles_forecasting ()
 
void infer_variable_types_from_data ()
 
void read_data_file_preview (const vector< vector< string_view > > &)
 
void check_separators (string_view) const
 
void samples_from_JSON (const Json *)
 
void variables_to_JSON (JsonWriter &) const
 
void samples_to_JSON (JsonWriter &) const
 
void preview_data_to_JSON (JsonWriter &) const
 
void variables_from_JSON (const Json *)
 
void preview_data_from_JSON (const Json *)
 

Protected Attributes

Shape input_shape
 
Shape target_shape
 
Shape decoder_shape
 
vector< SampleRole > sample_roles
 
vector< string > sample_ids
 
vector< Variable > variables
 
MatrixR data
 
filesystem::path data_path
 
Separator separator = Separator::Comma
 
bool has_header = false
 
bool has_sample_ids = false
 
Codification codification = Codification::UTF8
 
vector< vector< string > > data_file_preview
 
bool display = true
 
const vector< string > positive_words = {"1", "yes", "positive", "+", "true", "good", "si", "sí", "Sí"}
 
const vector< string > negative_words = {"0", "no", "negative", "-", "false", "bad", "not", "No"}
 

Detailed Description

Abstract base class for OpenNN datasets, owning samples, variables, and metadata.

Member Enumeration Documentation

◆ Codification

enum class opennn::Dataset::Codification
strong

Text encoding of the source data file.

Enumerator
UTF8 
SHIFT_JIS 

◆ Separator

enum class opennn::Dataset::Separator
strong

Field separator used when reading delimited text files.

Enumerator
Space 
Tab 
Comma 
Semicolon 

Constructor & Destructor Documentation

◆ ~Dataset()

virtual opennn::Dataset::~Dataset ( )
virtual default

◆ Dataset()

opennn::Dataset::Dataset ( )
protected default

Member Function Documentation

◆ augment_inputs()

virtual void opennn::Dataset::augment_inputs ( float * ,
Index  ) const
inline virtual

Applies data augmentation in place to the input buffer (no-op in base class).

Reimplemented in opennn::ImageDataset.

◆ calculate_correlations_rank()

virtual VectorI opennn::Dataset::calculate_correlations_rank ( ) const
inline nodiscard virtual

Returns input variables ranked by absolute correlation with the target.

Reimplemented in opennn::TabularDataset.

◆ calculate_feature_descriptives()

virtual vector< Descriptives > opennn::Dataset::calculate_feature_descriptives ( const string & ) const
inline nodiscard virtual

Returns descriptive statistics for features with the given role (subclass-specific).

Reimplemented in opennn::TabularDataset.

◆ calculate_input_target_variable_pearson_correlations()

virtual Tensor< Correlation, 2 > opennn::Dataset::calculate_input_target_variable_pearson_correlations ( ) const
inline nodiscard virtual

Returns the Pearson correlations between input and target variables.

Reimplemented in opennn::TabularDataset.

◆ calculate_target_distribution()

virtual VectorI opennn::Dataset::calculate_target_distribution ( ) const
inline nodiscard virtual

Returns the distribution of target values.

Reimplemented in opennn::LanguageDataset, and opennn::TabularDataset.

◆ check_separators()

void opennn::Dataset::check_separators ( string_view ) const
protected

◆ count_nan()

Index opennn::Dataset::count_nan ( ) const
nodiscard

Returns the total NaN count in the data matrix.

◆ count_nans_per_variable()

VectorI opennn::Dataset::count_nans_per_variable ( ) const
nodiscard

Returns the NaN count for each variable.

◆ count_rows_with_nan()

Index opennn::Dataset::count_rows_with_nan ( ) const
nodiscard

Returns the number of rows that contain at least one NaN.

◆ count_variables_with_nan()

Index opennn::Dataset::count_variables_with_nan ( ) const
nodiscard

Returns the number of variables that contain at least one NaN.

◆ fill_decoder()

virtual void opennn::Dataset::fill_decoder ( const vector< Index > & ,
const vector< Index > & ,
float * ,
bool is_training,
bool parallelize = true,
int contiguous = -1 ) const
virtual

Copies decoder input features into a destination buffer (sequence models).

Reimplemented in opennn::LanguageDataset.

◆ fill_inputs()

virtual void opennn::Dataset::fill_inputs ( const vector< Index > & ,
const vector< Index > & ,
float * ,
bool is_training,
bool parallelize = true,
int contiguous = -1 ) const
virtual

Copies input features of the given samples into a destination buffer.

Parameters
sample_indices: Row indices to read.
variable_indices: Variable indices selecting which features to copy.
destination: Output pointer that receives the flattened tensor.
is_training: True when the call is part of a training batch.
parallelize: If true, parallelize the copy.
contiguous: Stride hint for contiguous copies (-1 = auto).

Reimplemented in opennn::ImageDataset, opennn::LanguageDataset, and opennn::TimeSeriesDataset.

◆ fill_targets()

virtual void opennn::Dataset::fill_targets ( const vector< Index > & ,
const vector< Index > & ,
float * ,
bool is_training,
bool parallelize = true,
int contiguous = -1 ) const
virtual

Copies target features of the given samples into a destination buffer.

Reimplemented in opennn::ImageDataset, opennn::LanguageDataset, and opennn::TimeSeriesDataset.

◆ from_JSON()

virtual void opennn::Dataset::from_JSON ( const JsonDocument & )
pure virtual

Loads dataset state from a JSON document.

Implemented in opennn::ImageDataset, opennn::LanguageDataset, opennn::TabularDataset, and opennn::TimeSeriesDataset.

◆ get_batches()

virtual void opennn::Dataset::get_batches ( const vector< Index > & ,
Index ,
bool ,
vector< vector< Index > > &  ) const
virtual

Splits sample indices into batches and writes them into batches.

Parameters
indices: Sample indices to draw from.
batch_size: Number of samples per batch.
shuffle: Whether to shuffle indices before batching.
batches: Output vector receiving the batched index lists.

◆ get_codification()

const Codification & opennn::Dataset::get_codification ( ) const
inline nodiscard

Returns the configured text Codification.

◆ get_codification_string()

string opennn::Dataset::get_codification_string ( ) const
nodiscard

Returns the codification as its enumerator name.

◆ get_data() [1/2]

const MatrixR & opennn::Dataset::get_data ( ) const
inline nodiscard

Returns the raw data matrix (rows = samples, columns = features).

◆ get_data() [2/2]

MatrixR opennn::Dataset::get_data ( const string & ,
const string &  ) const
nodiscard

Returns the data submatrix for the given sample role and feature role.

◆ get_data_file_preview()

const vector< vector< string > > & opennn::Dataset::get_data_file_preview ( ) const
inline nodiscard

Returns the parsed preview rows captured during the last file read.

◆ get_data_from_indices()

MatrixR opennn::Dataset::get_data_from_indices ( const vector< Index > & ,
const vector< Index > &  ) const
nodiscard

Returns a submatrix from the data using explicit row and column indices.

◆ get_data_path()

const filesystem::path & opennn::Dataset::get_data_path ( ) const
inline nodiscard

Returns the configured data file path.

◆ get_display()

bool opennn::Dataset::get_display ( ) const
inline nodiscard

Returns whether progress messages are printed.

◆ get_feature_data()

MatrixR opennn::Dataset::get_feature_data ( const string & ) const
nodiscard

Returns the data columns belonging to features with the given role name.

◆ get_feature_dimensions()

vector< Index > opennn::Dataset::get_feature_dimensions ( ) const
nodiscard

Returns the per-variable feature dimension counts.

◆ get_feature_indices() [1/3]

vector< vector< Index > > opennn::Dataset::get_feature_indices ( ) const
nodiscard

Returns the data column indices grouped per variable.

◆ get_feature_indices() [2/3]

vector< Index > opennn::Dataset::get_feature_indices ( const Index ) const
nodiscard

Returns the data column indices that belong to the given variable index.

◆ get_feature_indices() [3/3]

vector< Index > opennn::Dataset::get_feature_indices ( const string & ) const
nodiscard

Returns the data column indices that belong to variables with the given role name.

◆ get_feature_names() [1/2]

vector< string > opennn::Dataset::get_feature_names ( ) const
nodiscard

Returns the expanded feature names (one entry per data matrix column).

◆ get_feature_names() [2/2]

vector< string > opennn::Dataset::get_feature_names ( const string & ) const
nodiscard

Returns the expanded feature names restricted to the given role.

◆ get_features_number() [1/2]

Index opennn::Dataset::get_features_number ( ) const
nodiscard

Returns the total number of features (data matrix columns).

◆ get_features_number() [2/2]

Index opennn::Dataset::get_features_number ( const string & ) const
nodiscard

Returns the number of features for variables with the given role name.

◆ get_input_shape()

Shape opennn::Dataset::get_input_shape ( ) const
inline nodiscard

Returns the configured input tensor shape.

◆ get_sample_data()

VectorR opennn::Dataset::get_sample_data ( const Index ) const
nodiscard

Returns the row of the data matrix at the given sample index.

◆ get_sample_indices()

vector< Index > opennn::Dataset::get_sample_indices ( const string & ) const
nodiscard

Returns indices of samples with the given role name.

◆ get_sample_role_numbers()

VectorI opennn::Dataset::get_sample_role_numbers ( ) const
nodiscard

Returns the per-sample roles as a tensor of integer indices.

◆ get_sample_roles()

const vector< SampleRole > & opennn::Dataset::get_sample_roles ( ) const
inline nodiscard

Returns the per-sample role vector.

◆ get_sample_roles_vector()

vector< Index > opennn::Dataset::get_sample_roles_vector ( ) const
nodiscard

Returns the per-sample roles as integer indices.

◆ get_samples_number() [1/2]

virtual Index opennn::Dataset::get_samples_number ( ) const
inline nodiscard virtual

Returns the total number of samples (rows) in the data matrix.

Reimplemented in opennn::ImageDataset, and opennn::LanguageDataset.

◆ get_samples_number() [2/2]

Index opennn::Dataset::get_samples_number ( const string & ) const
nodiscard

Returns the number of samples with the given role ("Training", "Validation", ...).

◆ get_separator()

const Separator & opennn::Dataset::get_separator ( ) const
inline nodiscard

Returns the current field Separator.

◆ get_separator_name()

string opennn::Dataset::get_separator_name ( ) const
nodiscard

Returns the separator as its enumerator name ("Space", "Tab", ...).

◆ get_separator_string()

string opennn::Dataset::get_separator_string ( ) const
nodiscard

Returns the separator as the literal character used in files.

◆ get_shape()

Shape opennn::Dataset::get_shape ( const string & ) const
nodiscard

Returns the configured Shape for the given role ("Input", "Target", "Decoder").

◆ get_target_shape()

Shape opennn::Dataset::get_target_shape ( ) const
inline nodiscard

Returns the configured target tensor shape.

◆ get_used_feature_indices()

vector< Index > opennn::Dataset::get_used_feature_indices ( ) const
nodiscard

Returns the data column indices that belong to in-use variables.

◆ get_used_features_number()

Index opennn::Dataset::get_used_features_number ( ) const
nodiscard

Returns the number of features for in-use variables.

◆ get_used_sample_indices()

vector< Index > opennn::Dataset::get_used_sample_indices ( ) const
nodiscard

Returns indices of all samples whose role is not None.

◆ get_used_samples_number()

Index opennn::Dataset::get_used_samples_number ( ) const
nodiscard

Returns the number of samples whose role is not None.

◆ get_used_variables_indices()

vector< Index > opennn::Dataset::get_used_variables_indices ( ) const
nodiscard

Returns the indices of all in-use variables.

◆ get_used_variables_number()

Index opennn::Dataset::get_used_variables_number ( ) const
nodiscard

Returns the number of variables whose role is in use.

◆ get_variable_data() [1/3]

MatrixR opennn::Dataset::get_variable_data ( const Index ) const
nodiscard

Returns the data columns belonging to the variable at index.

◆ get_variable_data() [2/3]

MatrixR opennn::Dataset::get_variable_data ( const Index ,
const vector< Index > &  ) const
nodiscard

Returns the variable data restricted to the given sample indices.

◆ get_variable_data() [3/3]

MatrixR opennn::Dataset::get_variable_data ( const string & ) const
nodiscard

Returns the data columns of the variable with the given name.

◆ get_variable_index() [1/2]

Index opennn::Dataset::get_variable_index ( const Index ) const
nodiscard

Returns the variable index corresponding to a flat feature index.

◆ get_variable_index() [2/2]

Index opennn::Dataset::get_variable_index ( const string & ) const
nodiscard

Returns the index of the variable with the given name.

◆ get_variable_indices()

vector< Index > opennn::Dataset::get_variable_indices ( const string & ) const
nodiscard

Returns the indices of variables matching the given role name.

◆ get_variable_names() [1/2]

vector< string > opennn::Dataset::get_variable_names ( ) const
nodiscard

Returns the names of all variables.

◆ get_variable_names() [2/2]

vector< string > opennn::Dataset::get_variable_names ( const string & ) const
nodiscard

Returns the names of variables with the given role name.

◆ get_variable_type()

VariableType opennn::Dataset::get_variable_type ( const Index index) const
inline nodiscard

Returns the VariableType of the variable at the given index.

◆ get_variable_types()

vector< VariableType > opennn::Dataset::get_variable_types ( const vector< Index > & indices) const
nodiscard

Returns the VariableType for each of the given variable indices.

◆ get_variables() [1/2]

const vector< Variable > & opennn::Dataset::get_variables ( ) const
inline nodiscard

Returns the variable descriptors.

◆ get_variables() [2/2]

vector< Variable > opennn::Dataset::get_variables ( const string & ) const
nodiscard

Returns the variables with the given role name.

◆ get_variables_number() [1/2]

Index opennn::Dataset::get_variables_number ( ) const
inline nodiscard

Returns the total number of variables (column descriptors).

◆ get_variables_number() [2/2]

Index opennn::Dataset::get_variables_number ( const string & ) const
nodiscard

Returns the number of variables with the given role name.

◆ has_binary_or_categorical_variables()

bool opennn::Dataset::has_binary_or_categorical_variables ( ) const
nodiscard

Returns true if any variable is Binary or Categorical.

◆ has_binary_variables()

bool opennn::Dataset::has_binary_variables ( ) const
nodiscard

Returns true if any variable has type Binary.

◆ has_categorical_variables()

bool opennn::Dataset::has_categorical_variables ( ) const
nodiscard

Returns true if any variable has type Categorical.

◆ has_nan()

bool opennn::Dataset::has_nan ( ) const
nodiscard

Returns true if any entry in the data matrix is NaN.

◆ has_nan_row()

bool opennn::Dataset::has_nan_row ( const Index ) const
nodiscard

Returns true if the sample at the given row contains a NaN.

◆ has_time_variable()

bool opennn::Dataset::has_time_variable ( ) const
nodiscard

Returns true if any variable has role Time.

◆ has_validation()

bool opennn::Dataset::has_validation ( ) const
nodiscard

Returns true if any sample is assigned the Validation role.

◆ infer_variable_types_from_data()

void opennn::Dataset::infer_variable_types_from_data ( )
protected

◆ is_empty()

virtual bool opennn::Dataset::is_empty ( ) const
inline nodiscard virtual

Returns true when the dataset contains no samples.

◆ is_sample_used()

bool opennn::Dataset::is_sample_used ( const Index i) const
inline nodiscard

Returns true if the sample at i has a role other than None.

◆ load()

void opennn::Dataset::load ( const filesystem::path & )

Loads the dataset metadata from the JSON file at the given path.

◆ load_data_binary()

void opennn::Dataset::load_data_binary ( )

Loads the data matrix from the binary file at the configured data path.

◆ preview_data_from_JSON()

void opennn::Dataset::preview_data_from_JSON ( const Json * )
protected

◆ preview_data_to_JSON()

void opennn::Dataset::preview_data_to_JSON ( JsonWriter & ) const
protected

◆ read_data_file_preview()

void opennn::Dataset::read_data_file_preview ( const vector< vector< string_view > > & )
protected

◆ resize_input_shape()

virtual void opennn::Dataset::resize_input_shape ( Index input_features_count)
inline virtual

Resizes the input shape to the given flat feature count.

Reimplemented in opennn::TimeSeriesDataset.

◆ samples_from_JSON()

void opennn::Dataset::samples_from_JSON ( const Json * )
protected

◆ samples_to_JSON()

void opennn::Dataset::samples_to_JSON ( JsonWriter & ) const
protected

◆ save()

void opennn::Dataset::save ( const filesystem::path & ) const

Saves the dataset metadata to a JSON file at the given path.

◆ save_data()

void opennn::Dataset::save_data ( ) const

Saves the data matrix to the configured data path.

◆ save_data_binary()

void opennn::Dataset::save_data_binary ( const filesystem::path & ) const

Saves the data matrix to the given path in binary form.

◆ scale_features()

virtual vector< Descriptives > opennn::Dataset::scale_features ( const string & )
inline virtual

Scales the features with the configured method; returns the applied descriptives.

Reimplemented in opennn::ImageDataset, and opennn::TabularDataset.

◆ scrub_missing_values()

virtual void opennn::Dataset::scrub_missing_values ( )
inline virtual

Removes or imputes missing values using the configured strategy.

Reimplemented in opennn::TabularDataset.

◆ set()

void opennn::Dataset::set ( const Index = 0,
const Shape & = {},
const Shape & = {} )

Resets the dataset with the given sample count and input/target shapes.

Parameters
sample_count: Number of samples to allocate.
input_shape: Shape of input features.
target_shape: Shape of target features.

◆ set_binary_variables()

void opennn::Dataset::set_binary_variables ( )

Detects and marks variables with binary values as VariableType::Binary.

◆ set_codification() [1/2]

void opennn::Dataset::set_codification ( const Codification & new_codification)
inline

◆ set_codification() [2/2]

void opennn::Dataset::set_codification ( const string & )

Sets the codification from its enumerator name.

◆ set_data()

void opennn::Dataset::set_data ( const MatrixR & )

Replaces the underlying data matrix.

◆ set_data_constant()

void opennn::Dataset::set_data_constant ( const float )

Fills the data matrix with the given constant value.

◆ set_data_integer()

virtual void opennn::Dataset::set_data_integer ( const Index )
inline virtual

Fills the data matrix with random integers up to the given vocabulary size.

Reimplemented in opennn::TabularDataset.

◆ set_data_path()

void opennn::Dataset::set_data_path ( const filesystem::path & new_data_path)
inline

◆ set_data_random()

virtual void opennn::Dataset::set_data_random ( )
inline virtual

Fills the data matrix with random values (no-op in base class).

Reimplemented in opennn::ImageDataset and opennn::TabularDataset.

◆ set_default()

void opennn::Dataset::set_default ( )

Restores default dataset settings.

◆ set_default_variable_names()

void opennn::Dataset::set_default_variable_names ( )

Assigns default placeholder names to all variables.

◆ set_default_variable_roles()

void opennn::Dataset::set_default_variable_roles ( )
protected

◆ set_default_variable_roles_forecasting()

void opennn::Dataset::set_default_variable_roles_forecasting ( )
protected

◆ set_display()

void opennn::Dataset::set_display ( bool new_display)
inline

◆ set_feature_names()

void opennn::Dataset::set_feature_names ( const vector< string > & )

Assigns expanded feature names, propagating categories back to variables.

◆ set_has_header()

void opennn::Dataset::set_has_header ( bool new_has_header)
inline

◆ set_has_ids()

void opennn::Dataset::set_has_ids ( bool new_has_ids)
inline

◆ set_input_variables_unused()

void opennn::Dataset::set_input_variables_unused ( )

Marks all input variables as unused (role None).

◆ set_sample_role()

void opennn::Dataset::set_sample_role ( const Index ,
const string &  )

Sets the role of a single sample by index.

◆ set_sample_roles() [1/3]

void opennn::Dataset::set_sample_roles ( const string & )

Assigns the same role to every sample.

◆ set_sample_roles() [2/3]

void opennn::Dataset::set_sample_roles ( const vector< Index > & ,
const string &  )

Assigns the same role to the given sample indices.

◆ set_sample_roles() [3/3]

void opennn::Dataset::set_sample_roles ( const vector< string > & )

Assigns a role per sample from a string vector.

◆ set_separator()

void opennn::Dataset::set_separator ( const Separator & new_separator)
inline

◆ set_separator_name()

void opennn::Dataset::set_separator_name ( const string & )

Sets the separator from its enumerator name.

◆ set_separator_string()

void opennn::Dataset::set_separator_string ( const string & )

Sets the separator from its literal character.

◆ set_shape()

void opennn::Dataset::set_shape ( const string & ,
const Shape &  )

Sets the tensor Shape associated with the given role ("Input"/"Target"/"Decoder").

◆ set_variable_indices()

void opennn::Dataset::set_variable_indices ( const vector< Index > & ,
const vector< Index > &  )

Sets which variable indices are inputs and which are targets.

Parameters
input_indices: Indices for variables to mark as Input.
target_indices: Indices for variables to mark as Target.

◆ set_variable_names()

void opennn::Dataset::set_variable_names ( const vector< string > & )

Assigns names to all variables from the given list.

◆ set_variable_role() [1/2]

void opennn::Dataset::set_variable_role ( const Index ,
const string &  )

Sets the role of the variable at the given index.

◆ set_variable_role() [2/2]

void opennn::Dataset::set_variable_role ( const string & ,
const string &  )

Sets the role of the variable with the given name.

◆ set_variable_roles() [1/2]

void opennn::Dataset::set_variable_roles ( const string & )

Assigns the same role to every variable.

◆ set_variable_roles() [2/2]

void opennn::Dataset::set_variable_roles ( const vector< string > & )

Sets the role of every variable from the given string list.

◆ set_variable_type() [1/2]

void opennn::Dataset::set_variable_type ( const Index ,
const VariableType &  )

Sets the type of the variable at the given index.

◆ set_variable_type() [2/2]

void opennn::Dataset::set_variable_type ( const string & ,
const VariableType &  )

Sets the type of the variable with the given name.

◆ set_variable_types()

void opennn::Dataset::set_variable_types ( const VariableType & )

Sets every variable to the given type.

◆ set_variables()

void opennn::Dataset::set_variables ( const vector< Variable > & new_variables)
inline

◆ set_variables_number()

void opennn::Dataset::set_variables_number ( const Index new_size)
inline

◆ split_samples() [1/2]

void opennn::Dataset::split_samples ( const float training_ratio = 0.6f,
float selection_ratio = 0.2f,
float testing_ratio = 0.2f,
bool shuffle = true )

Splits samples into Training/Validation/Testing roles, optionally shuffled.

Parameters
training_ratio: Fraction of samples assigned to Training.
selection_ratio: Fraction of samples assigned to Validation.
testing_ratio: Fraction of samples assigned to Testing.
shuffle: If true, shuffles samples before splitting.

◆ split_samples() [2/2]

vector< vector< Index > > opennn::Dataset::split_samples ( const vector< Index > & ,
Index  ) const
nodiscard

Splits the given indices into chunks of the requested size.

◆ split_samples_random()

void opennn::Dataset::split_samples_random ( const float training_ratio = 0.6f,
float selection_ratio = 0.2f,
float testing_ratio = 0.2f )

Splits samples randomly across roles.

◆ split_samples_sequential()

void opennn::Dataset::split_samples_sequential ( const float training_ratio = 0.6f,
float selection_ratio = 0.2f,
float testing_ratio = 0.2f )

Splits samples sequentially without shuffling.

◆ to_JSON()

virtual void opennn::Dataset::to_JSON ( JsonWriter & ) const
inline virtual

Writes dataset state to a JSON writer.

Reimplemented in opennn::ImageDataset, opennn::LanguageDataset, opennn::TabularDataset, and opennn::TimeSeriesDataset.

◆ unscale_features()

virtual void opennn::Dataset::unscale_features ( const string & ,
const vector< Descriptives > &  )
inline virtual

Reverts a previously applied scaling using the supplied descriptives.

Reimplemented in opennn::ImageDataset and opennn::TabularDataset.

◆ variables_from_JSON()

void opennn::Dataset::variables_from_JSON ( const Json * )
protected

◆ variables_to_JSON()

void opennn::Dataset::variables_to_JSON ( JsonWriter & ) const
protected

Member Data Documentation

◆ codification

Codification opennn::Dataset::codification = Codification::UTF8
protected

◆ data

MatrixR opennn::Dataset::data
protected

◆ data_file_preview

vector<vector<string> > opennn::Dataset::data_file_preview
protected

◆ data_path

filesystem::path opennn::Dataset::data_path
protected

◆ decoder_shape

Shape opennn::Dataset::decoder_shape
protected

◆ display

bool opennn::Dataset::display = true
protected

◆ has_header

bool opennn::Dataset::has_header = false
protected

◆ has_sample_ids

bool opennn::Dataset::has_sample_ids = false
protected

◆ input_shape

Shape opennn::Dataset::input_shape
protected

◆ negative_words

const vector<string> opennn::Dataset::negative_words = {"0", "no", "negative", "-", "false", "bad", "not", "No"}
protected

◆ positive_words

const vector<string> opennn::Dataset::positive_words = {"1", "yes", "positive", "+", "true", "good", "si", "sí", "Sí"}
protected

◆ sample_ids

vector<string> opennn::Dataset::sample_ids
protected

◆ sample_roles

vector<SampleRole> opennn::Dataset::sample_roles
protected

◆ separator

Separator opennn::Dataset::separator = Separator::Comma
protected

◆ target_shape

Shape opennn::Dataset::target_shape
protected

◆ variables

vector<Variable> opennn::Dataset::variables
protected