Abstract base class for OpenNN datasets, owning samples, variables, and metadata. More...

#include <dataset.h>

Inheritance diagram for opennn::Dataset:

Public Types
enum class	Codification { UTF8 , SHIFT_JIS }
	Text encoding of the source data file. More...

enum class	Separator { Space , Tab , Comma , Semicolon }
	Field separator used when reading delimited text files. More...

Public Member Functions
virtual	~Dataset ()=default

virtual Index	get_samples_number () const
	Returns the total number of samples (rows) in the data matrix.

Index	get_samples_number (const string &) const
	Returns the number of samples with the given role ("Training", "Validation", ...).

Index	get_used_samples_number () const
	Returns the number of samples whose role is not None.

vector< Index >	get_sample_indices (const string &) const
	Returns indices of samples with the given role name.

vector< Index >	get_used_sample_indices () const
	Returns indices of all samples whose role is not None.

const vector< SampleRole > &	get_sample_roles () const
	Returns the per-sample role vector.

vector< Index >	get_sample_roles_vector () const
	Returns the per-sample roles as integer indices.

VectorI	get_sample_role_numbers () const
	Returns the per-sample roles as a tensor of integer indices.

Index	get_variables_number () const
	Returns the total number of variables (column descriptors).

Index	get_variables_number (const string &) const
	Returns the number of variables with the given role name.

Index	get_used_variables_number () const
	Returns the number of variables whose role is in use.

const vector< Variable > &	get_variables () const
	Returns the variable descriptors.

vector< Variable >	get_variables (const string &) const
	Returns the variables with the given role name.

Index	get_variable_index (const string &) const
	Returns the index of the variable with the given name.

Index	get_variable_index (const Index) const
	Returns the variable index corresponding to a flat feature index.

vector< Index >	get_variable_indices (const string &) const
	Returns the indices of variables matching the given role name.

vector< Index >	get_used_variables_indices () const
	Returns the indices of all in-use variables.

vector< string >	get_variable_names () const
	Returns the names of all variables.

vector< string >	get_variable_names (const string &) const
	Returns the names of variables with the given role name.

VariableType	get_variable_type (const Index index) const
	Returns the VariableType of the variable at the given index.

vector< VariableType >	get_variable_types (const vector< Index > &indices) const
	Returns the VariableType for each of the given variable indices.

Index	get_features_number () const
	Returns the total number of features (data matrix columns).

Index	get_features_number (const string &) const
	Returns the number of features for variables with the given role name.

Index	get_used_features_number () const
	Returns the number of features for in-use variables.

vector< string >	get_feature_names () const
	Returns the expanded feature names (one entry per data matrix column).

vector< string >	get_feature_names (const string &) const
	Returns the expanded feature names restricted to the given role.

vector< vector< Index > >	get_feature_indices () const
	Returns the data column indices grouped per variable.

vector< Index >	get_feature_indices (const Index) const
	Returns the data column indices that belong to the given variable index.

vector< Index >	get_feature_indices (const string &) const
	Returns the data column indices that belong to variables with the given role name.

vector< Index >	get_used_feature_indices () const
	Returns the data column indices that belong to in-use variables.

vector< Index >	get_feature_dimensions () const
	Returns the per-variable feature dimension counts.

Shape	get_shape (const string &) const
	Returns the configured Shape for the given role ("Input", "Target", "Decoder").

virtual void	get_batches (const vector< Index > &, Index, bool, vector< vector< Index > > &) const
	Splits sample indices into batches and writes them into `batches`.

const vector< vector< string > > &	get_data_file_preview () const
	Returns the parsed preview rows captured during the last file read.

const filesystem::path &	get_data_path () const
	Returns the configured data file path.

const Separator &	get_separator () const
	Returns the current field Separator.

string	get_separator_string () const
	Returns the separator as the literal character used in files.

string	get_separator_name () const
	Returns the separator as its enumerator name ("Space", "Tab", ...).

const Codification &	get_codification () const
	Returns the configured text Codification.

string	get_codification_string () const
	Returns the codification as its enumerator name.

bool	get_display () const
	Returns whether progress messages are printed.

virtual bool	is_empty () const
	Returns true when the dataset contains no samples.

Shape	get_input_shape () const
	Returns the configured input tensor shape.

Shape	get_target_shape () const
	Returns the configured target tensor shape.

const MatrixR &	get_data () const
	Returns the raw data matrix (rows = samples, columns = features).

void	set_data (const MatrixR &)
	Replaces the underlying data matrix.

void	set_data_constant (const float)
	Fills the data matrix with the given constant value.

void	set_default ()
	Restores default dataset settings.

void	set_sample_roles (const string &)
	Assigns the same role to every sample.

void	set_sample_role (const Index, const string &)
	Sets the role of a single sample by index.

void	set_sample_roles (const vector< string > &)
	Assigns a role per sample from a string vector.

void	set_sample_roles (const vector< Index > &, const string &)
	Assigns the same role to the given sample indices.

void	set_variables (const vector< Variable > &new_variables)

void	set_default_variable_names ()
	Assigns default placeholder names to all variables.

void	set_variable_roles (const vector< string > &)
	Sets the role of every variable from the given string list.

void	set_variable_indices (const vector< Index > &, const vector< Index > &)
	Sets which variable indices are inputs and which are targets.

void	set_input_variables_unused ()
	Marks all input variables as unused (role None).

void	set_variable_role (const Index, const string &)
	Sets the role of the variable at the given index.

void	set_variable_role (const string &, const string &)
	Sets the role of the variable with the given name.

void	set_variable_type (const Index, const VariableType &)
	Sets the type of the variable at the given index.

void	set_variable_type (const string &, const VariableType &)
	Sets the type of the variable with the given name.

void	set_variable_types (const VariableType &)
	Sets every variable to the given type.

void	set_binary_variables ()
	Detects and marks variables with binary values as VariableType::Binary.

void	set_variable_names (const vector< string > &)
	Assigns names to all variables from the given list.

void	set_variables_number (const Index new_size)

void	set_feature_names (const vector< string > &)
	Assigns expanded feature names, propagating categories back to variables.

void	set_variable_roles (const string &)
	Assigns the same role to every variable.

void	set_shape (const string &, const Shape &)
	Sets the tensor Shape associated with the given role ("Input"/"Target"/"Decoder").

virtual void	resize_input_shape (Index input_features_count)
	Resizes the input shape to the given flat feature count.

void	set_data_path (const filesystem::path &new_data_path)

void	set_has_header (bool new_has_header)

void	set_has_ids (bool new_has_ids)

void	set_separator (const Separator &new_separator)

void	set_separator_string (const string &)
	Sets the separator from its literal character.

void	set_separator_name (const string &)
	Sets the separator from its enumerator name.

void	set_codification (const Codification &new_codification)

void	set_codification (const string &)
	Sets the codification from its enumerator name.

void	set_display (bool new_display)

bool	is_sample_used (const Index i) const
	Returns true if the sample at `i` has a role other than None.

bool	has_binary_variables () const
	Returns true if any variable has type Binary.

bool	has_categorical_variables () const
	Returns true if any variable has type Categorical.

bool	has_binary_or_categorical_variables () const
	Returns true if any variable is Binary or Categorical.

bool	has_time_variable () const
	Returns true if any variable has role Time.

bool	has_validation () const
	Returns true if any sample is assigned the Validation role.

void	split_samples (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true)
	Splits samples into Training/Validation/Testing roles, optionally shuffled.

void	split_samples_sequential (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
	Splits samples sequentially without shuffling.

void	split_samples_random (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
	Splits samples randomly across roles.

vector< vector< Index > >	split_samples (const vector< Index > &, Index) const
	Splits the given indices into chunks of the requested size.

virtual vector< Descriptives >	scale_features (const string &)
	Scales the features with the configured method; returns the applied descriptives.

virtual void	set_data_random ()
	Fills the data matrix with random values (no-op in base class).

virtual void	set_data_integer (const Index)
	Fills the data matrix with random integers up to the given vocabulary size.

virtual void	from_JSON (const JsonDocument &)=0
	Loads dataset state from a JSON document.

virtual void	to_JSON (JsonWriter &) const
	Writes dataset state to a JSON writer.

MatrixR	get_data (const string &, const string &) const
	Returns the data submatrix for the given sample role and feature role.

MatrixR	get_data_from_indices (const vector< Index > &, const vector< Index > &) const
	Returns a submatrix from the data using explicit row and column indices.

VectorR	get_sample_data (const Index) const
	Returns the row of the data matrix at the given sample index.

MatrixR	get_variable_data (const Index) const
	Returns the data columns belonging to the variable at `index`.

MatrixR	get_variable_data (const Index, const vector< Index > &) const
	Returns the variable data restricted to the given sample indices.

MatrixR	get_variable_data (const string &) const
	Returns the data columns of the variable with the given name.

MatrixR	get_feature_data (const string &) const
	Returns the data columns belonging to features with the given role name.

void	set (const Index=0, const Shape &={}, const Shape &={})
	Resets the dataset with the given sample count and input/target shapes.

bool	has_nan () const
	Returns true if any entry in the data matrix is NaN.

bool	has_nan_row (const Index) const
	Returns true if the sample at the given row contains a NaN.

VectorI	count_nans_per_variable () const
	Returns the NaN count for each variable.

Index	count_variables_with_nan () const
	Returns the number of variables that contain at least one NaN.

Index	count_rows_with_nan () const
	Returns the number of rows that contain at least one NaN.

Index	count_nan () const
	Returns the total NaN count in the data matrix.

virtual void	scrub_missing_values ()
	Removes or imputes missing values using the configured strategy.

virtual vector< Descriptives >	calculate_feature_descriptives (const string &) const
	Returns descriptive statistics for features with the given role (subclass-specific).

virtual Tensor< Correlation, 2 >	calculate_input_target_variable_pearson_correlations () const
	Returns the Pearson correlations between input and target variables.

virtual VectorI	calculate_target_distribution () const
	Returns the distribution of target values.

virtual VectorI	calculate_correlations_rank () const
	Returns input variables ranked by absolute correlation with the target.

virtual void	unscale_features (const string &, const vector< Descriptives > &)
	Reverts a previously applied scaling using the supplied descriptives.

void	save (const filesystem::path &) const
	Saves the dataset metadata to a JSON file at the given path.

void	load (const filesystem::path &)
	Loads the dataset metadata from the JSON file at the given path.

void	save_data () const
	Saves the data matrix to the configured data path.

void	save_data_binary (const filesystem::path &) const
	Saves the data matrix to the given path in binary form.

void	load_data_binary ()
	Loads the data matrix from the binary file at the configured data path.

virtual void	fill_inputs (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
	Copies input features of the given samples into a destination buffer.

virtual void	augment_inputs (float *, Index) const
	Applies data augmentation in place to the input buffer (no-op in base class).

virtual void	fill_decoder (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
	Copies decoder input features into a destination buffer (sequence models).

virtual void	fill_targets (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
	Copies target features of the given samples into a destination buffer.

Protected Member Functions
	Dataset ()=default

void	set_default_variable_roles ()

void	set_default_variable_roles_forecasting ()

void	infer_variable_types_from_data ()

void	read_data_file_preview (const vector< vector< string_view > > &)

void	check_separators (string_view) const

void	samples_from_JSON (const Json *)

void	variables_to_JSON (JsonWriter &) const

void	samples_to_JSON (JsonWriter &) const

void	preview_data_to_JSON (JsonWriter &) const

void	variables_from_JSON (const Json *)

void	preview_data_from_JSON (const Json *)

Protected Attributes
Shape	input_shape

Shape	target_shape

Shape	decoder_shape

vector< SampleRole >	sample_roles

vector< string >	sample_ids

vector< Variable >	variables

MatrixR	data

filesystem::path	data_path

Separator	separator = Separator::Comma

bool	has_header = false

bool	has_sample_ids = false

Codification	codification = Codification::UTF8

vector< vector< string > >	data_file_preview

bool	display = true

const vector< string >	positive_words = {"1", "yes", "positive", "+", "true", "good", "si", "sí", "Sí"}

const vector< string >	negative_words = {"0", "no", "negative", "-", "false", "bad", "not", "No"}

Detailed Description

Abstract base class for OpenNN datasets, owning samples, variables, and metadata.

Member Enumeration Documentation

◆ Codification

enum class opennn::Dataset::Codification

strong

Text encoding of the source data file.

Enumerator
UTF8
SHIFT_JIS

◆ Separator

enum class opennn::Dataset::Separator

strong

Field separator used when reading delimited text files.

Enumerator
Space
Tab
Comma
Semicolon

Constructor & Destructor Documentation

◆ ~Dataset()

virtual opennn::Dataset::~Dataset ( )

virtual default

◆ Dataset()

opennn::Dataset::Dataset ( )

protected default

Member Function Documentation

◆ augment_inputs()

virtual void opennn::Dataset::augment_inputs	(	float *	,
		Index	) const

inline virtual

Applies data augmentation in place to the input buffer (no-op in base class).

Reimplemented in opennn::ImageDataset.

◆ calculate_correlations_rank()

virtual VectorI opennn::Dataset::calculate_correlations_rank ( ) const

inline nodiscard virtual

Returns input variables ranked by absolute correlation with the target.

Reimplemented in opennn::TabularDataset.

◆ calculate_feature_descriptives()

virtual vector< Descriptives > opennn::Dataset::calculate_feature_descriptives ( const string & ) const

inline nodiscard virtual

Returns descriptive statistics for features with the given role (subclass-specific).

Reimplemented in opennn::TabularDataset.

◆ calculate_input_target_variable_pearson_correlations()

virtual Tensor< Correlation, 2 > opennn::Dataset::calculate_input_target_variable_pearson_correlations ( ) const

inline nodiscard virtual

Returns the Pearson correlations between input and target variables.

Reimplemented in opennn::TabularDataset.

◆ calculate_target_distribution()

virtual VectorI opennn::Dataset::calculate_target_distribution ( ) const

inline nodiscard virtual

Returns the distribution of target values.

Reimplemented in opennn::LanguageDataset, and opennn::TabularDataset.

◆ check_separators()

void opennn::Dataset::check_separators ( string_view ) const

protected

◆ count_nan()

Index opennn::Dataset::count_nan ( ) const

nodiscard

Returns the total NaN count in the data matrix.

◆ count_nans_per_variable()

VectorI opennn::Dataset::count_nans_per_variable ( ) const

nodiscard

Returns the NaN count for each variable.

◆ count_rows_with_nan()

Index opennn::Dataset::count_rows_with_nan ( ) const

nodiscard

Returns the number of rows that contain at least one NaN.

◆ count_variables_with_nan()

Index opennn::Dataset::count_variables_with_nan ( ) const

nodiscard

Returns the number of variables that contain at least one NaN.

◆ fill_decoder()

virtual void opennn::Dataset::fill_decoder	(	const vector< Index > &	,
		const vector< Index > &	,
		float *	,
		bool	is_training,
		bool	parallelize = true,
		int	contiguous = -1 ) const

virtual

Copies decoder input features into a destination buffer (sequence models).

Reimplemented in opennn::LanguageDataset.

◆ fill_inputs()

virtual void opennn::Dataset::fill_inputs	(	const vector< Index > &	,
		const vector< Index > &	,
		float *	,
		bool	is_training,
		bool	parallelize = true,
		int	contiguous = -1 ) const

virtual

Copies input features of the given samples into a destination buffer.

Parameters

sample_indices	Row indices to read.
variable_indices	Variable indices selecting which features to copy.
destination	Output pointer that receives the flattened tensor.
is_training	True when the call is part of a training batch.
parallelize	If true, parallelize the copy.
contiguous	Stride hint for contiguous copies (-1 = auto).

Reimplemented in opennn::ImageDataset, opennn::LanguageDataset, and opennn::TimeSeriesDataset.

◆ fill_targets()

virtual void opennn::Dataset::fill_targets	(	const vector< Index > &	,
		const vector< Index > &	,
		float *	,
		bool	is_training,
		bool	parallelize = true,
		int	contiguous = -1 ) const

virtual

Copies target features of the given samples into a destination buffer.

Reimplemented in opennn::ImageDataset, opennn::LanguageDataset, and opennn::TimeSeriesDataset.

◆ from_JSON()

virtual void opennn::Dataset::from_JSON ( const JsonDocument & )

pure virtual

Loads dataset state from a JSON document.

Implemented in opennn::ImageDataset, opennn::LanguageDataset, opennn::TabularDataset, and opennn::TimeSeriesDataset.

◆ get_batches()

virtual void opennn::Dataset::get_batches	(	const vector< Index > &	,
		Index	,
		bool	,
		vector< vector< Index > > &	) const

virtual

Splits sample indices into batches and writes them into the `batches` output parameter.

Parameters

indices	Sample indices to draw from.
batch_size	Number of samples per batch.
shuffle	Whether to shuffle indices before batching.
batches	Output vector receiving the batched index lists.

◆ get_codification()

const Codification & opennn::Dataset::get_codification ( ) const

inline nodiscard

Returns the configured text Codification.

◆ get_codification_string()

string opennn::Dataset::get_codification_string ( ) const

nodiscard

Returns the codification as its enumerator name.

◆ get_data() [1/2]

const MatrixR & opennn::Dataset::get_data ( ) const

inline nodiscard

Returns the raw data matrix (rows = samples, columns = features).

◆ get_data() [2/2]

MatrixR opennn::Dataset::get_data	(	const string &	,
		const string &	) const

nodiscard

Returns the data submatrix for the given sample role and feature role.

◆ get_data_file_preview()

const vector< vector< string > > & opennn::Dataset::get_data_file_preview ( ) const

inline nodiscard

Returns the parsed preview rows captured during the last file read.

◆ get_data_from_indices()

MatrixR opennn::Dataset::get_data_from_indices	(	const vector< Index > &	,
		const vector< Index > &	) const

nodiscard

Returns a submatrix from the data using explicit row and column indices.

◆ get_data_path()

const filesystem::path & opennn::Dataset::get_data_path ( ) const

inline nodiscard

Returns the configured data file path.

◆ get_display()

bool opennn::Dataset::get_display ( ) const

inline nodiscard

Returns whether progress messages are printed.

◆ get_feature_data()

MatrixR opennn::Dataset::get_feature_data ( const string & ) const

nodiscard

Returns the data columns belonging to features with the given role name.

◆ get_feature_dimensions()

vector< Index > opennn::Dataset::get_feature_dimensions ( ) const

nodiscard

Returns the per-variable feature dimension counts.

◆ get_feature_indices() [1/3]

vector< vector< Index > > opennn::Dataset::get_feature_indices ( ) const

nodiscard

Returns the data column indices grouped per variable.

◆ get_feature_indices() [2/3]

vector< Index > opennn::Dataset::get_feature_indices ( const Index ) const

nodiscard

Returns the data column indices that belong to the given variable index.

◆ get_feature_indices() [3/3]

vector< Index > opennn::Dataset::get_feature_indices ( const string & ) const

nodiscard

Returns the data column indices that belong to variables with the given role name.

◆ get_feature_names() [1/2]

vector< string > opennn::Dataset::get_feature_names ( ) const

nodiscard

Returns the expanded feature names (one entry per data matrix column).

◆ get_feature_names() [2/2]

vector< string > opennn::Dataset::get_feature_names ( const string & ) const

nodiscard

Returns the expanded feature names restricted to the given role.

◆ get_features_number() [1/2]

Index opennn::Dataset::get_features_number ( ) const

nodiscard

Returns the total number of features (data matrix columns).

◆ get_features_number() [2/2]

Index opennn::Dataset::get_features_number ( const string & ) const

nodiscard

Returns the number of features for variables with the given role name.

◆ get_input_shape()

Shape opennn::Dataset::get_input_shape ( ) const

inline nodiscard

Returns the configured input tensor shape.

◆ get_sample_data()

VectorR opennn::Dataset::get_sample_data ( const Index ) const

nodiscard

Returns the row of the data matrix at the given sample index.

◆ get_sample_indices()

vector< Index > opennn::Dataset::get_sample_indices ( const string & ) const

nodiscard

Returns indices of samples with the given role name.

◆ get_sample_role_numbers()

VectorI opennn::Dataset::get_sample_role_numbers ( ) const

nodiscard

Returns the per-sample roles as a tensor of integer indices.

◆ get_sample_roles()

const vector< SampleRole > & opennn::Dataset::get_sample_roles ( ) const

inline nodiscard

Returns the per-sample role vector.

◆ get_sample_roles_vector()

vector< Index > opennn::Dataset::get_sample_roles_vector ( ) const

nodiscard

Returns the per-sample roles as integer indices.

◆ get_samples_number() [1/2]

virtual Index opennn::Dataset::get_samples_number ( ) const

inline nodiscard virtual

Returns the total number of samples (rows) in the data matrix.

Reimplemented in opennn::ImageDataset, and opennn::LanguageDataset.

◆ get_samples_number() [2/2]

Index opennn::Dataset::get_samples_number ( const string & ) const

nodiscard

Returns the number of samples with the given role ("Training", "Validation", ...).

◆ get_separator()

const Separator & opennn::Dataset::get_separator ( ) const

inline nodiscard

Returns the current field Separator.

◆ get_separator_name()

string opennn::Dataset::get_separator_name ( ) const

nodiscard

Returns the separator as its enumerator name ("Space", "Tab", ...).

◆ get_separator_string()

string opennn::Dataset::get_separator_string ( ) const

nodiscard

Returns the separator as the literal character used in files.

◆ get_shape()

Shape opennn::Dataset::get_shape ( const string & ) const

nodiscard

Returns the configured Shape for the given role ("Input", "Target", "Decoder").

◆ get_target_shape()

Shape opennn::Dataset::get_target_shape ( ) const

inline nodiscard

Returns the configured target tensor shape.

◆ get_used_feature_indices()

vector< Index > opennn::Dataset::get_used_feature_indices ( ) const

nodiscard

Returns the data column indices that belong to in-use variables.

◆ get_used_features_number()

Index opennn::Dataset::get_used_features_number ( ) const

nodiscard

Returns the number of features for in-use variables.

◆ get_used_sample_indices()

vector< Index > opennn::Dataset::get_used_sample_indices ( ) const

nodiscard

Returns indices of all samples whose role is not None.

◆ get_used_samples_number()

Index opennn::Dataset::get_used_samples_number ( ) const

nodiscard

Returns the number of samples whose role is not None.

◆ get_used_variables_indices()

vector< Index > opennn::Dataset::get_used_variables_indices ( ) const

nodiscard

Returns the indices of all in-use variables.

◆ get_used_variables_number()

Index opennn::Dataset::get_used_variables_number ( ) const

nodiscard

Returns the number of variables whose role is in use.

◆ get_variable_data() [1/3]

MatrixR opennn::Dataset::get_variable_data ( const Index ) const

nodiscard

Returns the data columns belonging to the variable at the given index.

◆ get_variable_data() [2/3]

MatrixR opennn::Dataset::get_variable_data	(	const Index	,
		const vector< Index > &	) const

nodiscard

Returns the variable data restricted to the given sample indices.

◆ get_variable_data() [3/3]

MatrixR opennn::Dataset::get_variable_data ( const string & ) const

nodiscard

Returns the data columns of the variable with the given name.

◆ get_variable_index() [1/2]

Index opennn::Dataset::get_variable_index ( const Index ) const

nodiscard

Returns the variable index corresponding to a flat feature index.

◆ get_variable_index() [2/2]

Index opennn::Dataset::get_variable_index ( const string & ) const

nodiscard

Returns the index of the variable with the given name.

◆ get_variable_indices()

vector< Index > opennn::Dataset::get_variable_indices ( const string & ) const

nodiscard

Returns the indices of variables matching the given role name.

◆ get_variable_names() [1/2]

vector< string > opennn::Dataset::get_variable_names ( ) const

nodiscard

Returns the names of all variables.

◆ get_variable_names() [2/2]

vector< string > opennn::Dataset::get_variable_names ( const string & ) const

nodiscard

Returns the names of variables with the given role name.

◆ get_variable_type()

VariableType opennn::Dataset::get_variable_type ( const Index index ) const

inline nodiscard

Returns the VariableType of the variable at the given index.

◆ get_variable_types()

vector< VariableType > opennn::Dataset::get_variable_types ( const vector< Index > & indices ) const

nodiscard

Returns the VariableType for each of the given variable indices.

◆ get_variables() [1/2]

const vector< Variable > & opennn::Dataset::get_variables ( ) const

inline nodiscard

Returns the variable descriptors.

◆ get_variables() [2/2]

vector< Variable > opennn::Dataset::get_variables ( const string & ) const

nodiscard

Returns the variables with the given role name.

◆ get_variables_number() [1/2]

Index opennn::Dataset::get_variables_number ( ) const

inline nodiscard

Returns the total number of variables (column descriptors).

◆ get_variables_number() [2/2]

Index opennn::Dataset::get_variables_number ( const string & ) const

nodiscard

Returns the number of variables with the given role name.

◆ has_binary_or_categorical_variables()

bool opennn::Dataset::has_binary_or_categorical_variables ( ) const

nodiscard

Returns true if any variable is Binary or Categorical.

◆ has_binary_variables()

bool opennn::Dataset::has_binary_variables ( ) const

nodiscard

Returns true if any variable has type Binary.

◆ has_categorical_variables()

bool opennn::Dataset::has_categorical_variables ( ) const

nodiscard

Returns true if any variable has type Categorical.

◆ has_nan()

bool opennn::Dataset::has_nan ( ) const

nodiscard

Returns true if any entry in the data matrix is NaN.

◆ has_nan_row()

bool opennn::Dataset::has_nan_row ( const Index ) const

nodiscard

Returns true if the sample at the given row contains a NaN.

◆ has_time_variable()

bool opennn::Dataset::has_time_variable ( ) const

nodiscard

Returns true if any variable has role Time.

◆ has_validation()

bool opennn::Dataset::has_validation ( ) const

nodiscard

Returns true if any sample is assigned the Validation role.

◆ infer_variable_types_from_data()

void opennn::Dataset::infer_variable_types_from_data ( )

protected

◆ is_empty()

virtual bool opennn::Dataset::is_empty ( ) const

inline nodiscard virtual

Returns true when the dataset contains no samples.

◆ is_sample_used()

bool opennn::Dataset::is_sample_used ( const Index i ) const

inline nodiscard

Returns true if the sample at index `i` has a role other than None.

◆ load()

void opennn::Dataset::load ( const filesystem::path & )

Loads the dataset metadata from the JSON file at the given path.

◆ load_data_binary()

void opennn::Dataset::load_data_binary ( )

Loads the data matrix from the binary file at the configured data path.

◆ preview_data_from_JSON()

void opennn::Dataset::preview_data_from_JSON ( const Json * )

protected

◆ preview_data_to_JSON()

void opennn::Dataset::preview_data_to_JSON ( JsonWriter & ) const

protected

◆ read_data_file_preview()

void opennn::Dataset::read_data_file_preview ( const vector< vector< string_view > > & )

protected

◆ resize_input_shape()

virtual void opennn::Dataset::resize_input_shape ( Index input_features_count )

inline virtual

Resizes the input shape to the given flat feature count.

Reimplemented in opennn::TimeSeriesDataset.

◆ samples_from_JSON()

void opennn::Dataset::samples_from_JSON ( const Json * )

protected

◆ samples_to_JSON()

void opennn::Dataset::samples_to_JSON ( JsonWriter & ) const

protected

◆ save()

void opennn::Dataset::save ( const filesystem::path & ) const

Saves the dataset metadata to a JSON file at the given path.

◆ save_data()

void opennn::Dataset::save_data ( ) const

Saves the data matrix to the configured data path.

◆ save_data_binary()

void opennn::Dataset::save_data_binary ( const filesystem::path & ) const

Saves the data matrix to the given path in binary form.

◆ scale_features()

virtual vector< Descriptives > opennn::Dataset::scale_features ( const string & )

inline virtual

Scales the features with the configured method; returns the applied descriptives.

Reimplemented in opennn::ImageDataset, and opennn::TabularDataset.

◆ scrub_missing_values()

virtual void opennn::Dataset::scrub_missing_values ( )

inline virtual

Removes or imputes missing values using the configured strategy.

Reimplemented in opennn::TabularDataset.

◆ set()

void opennn::Dataset::set	(	const Index	= 0,
		const Shape &	= {},
		const Shape &	= {} )

Resets the dataset with the given sample count and input/target shapes.

Parameters

sample_count	Number of samples to allocate.
input_shape	Shape of input features.
target_shape	Shape of target features.

◆ set_binary_variables()

void opennn::Dataset::set_binary_variables ( )

Detects and marks variables with binary values as VariableType::Binary.

◆ set_codification() [1/2]

void opennn::Dataset::set_codification ( const Codification & new_codification )

inline

◆ set_codification() [2/2]

void opennn::Dataset::set_codification ( const string & )

Sets the codification from its enumerator name.

◆ set_data()

void opennn::Dataset::set_data ( const MatrixR & )

Replaces the underlying data matrix.

◆ set_data_constant()

void opennn::Dataset::set_data_constant ( const float )

Fills the data matrix with the given constant value.

◆ set_data_integer()

virtual void opennn::Dataset::set_data_integer ( const Index )

inline virtual

Fills the data matrix with random integers up to the given vocabulary size.

Reimplemented in opennn::TabularDataset.

◆ set_data_path()

void opennn::Dataset::set_data_path ( const filesystem::path & new_data_path )

inline

◆ set_data_random()

virtual void opennn::Dataset::set_data_random ( )

inline virtual

Fills the data matrix with random values (no-op in base class).

Reimplemented in opennn::ImageDataset and opennn::TabularDataset.

◆ set_default()

void opennn::Dataset::set_default ( )

Restores default dataset settings.

◆ set_default_variable_names()

void opennn::Dataset::set_default_variable_names ( )

Assigns default placeholder names to all variables.

◆ set_default_variable_roles()

void opennn::Dataset::set_default_variable_roles ( )

protected

◆ set_default_variable_roles_forecasting()

void opennn::Dataset::set_default_variable_roles_forecasting ( )

protected

◆ set_display()

void opennn::Dataset::set_display ( bool new_display )

inline

◆ set_feature_names()

void opennn::Dataset::set_feature_names ( const vector< string > & )

Assigns expanded feature names, propagating categories back to variables.

◆ set_has_header()

void opennn::Dataset::set_has_header ( bool new_has_header )

inline

◆ set_has_ids()

void opennn::Dataset::set_has_ids ( bool new_has_ids )

inline

◆ set_input_variables_unused()

void opennn::Dataset::set_input_variables_unused ( )

Marks all input variables as unused (role None).

◆ set_sample_role()

void opennn::Dataset::set_sample_role	(	const Index	,
		const string &	)

Sets the role of a single sample by index.

◆ set_sample_roles() [1/3]

void opennn::Dataset::set_sample_roles ( const string & )

Assigns the same role to every sample.

◆ set_sample_roles() [2/3]

void opennn::Dataset::set_sample_roles	(	const vector< Index > &	,
		const string &	)

Assigns the same role to the given sample indices.

◆ set_sample_roles() [3/3]

void opennn::Dataset::set_sample_roles ( const vector< string > & )

Assigns a role per sample from a string vector.

◆ set_separator()

void opennn::Dataset::set_separator ( const Separator & new_separator )

inline

◆ set_separator_name()

void opennn::Dataset::set_separator_name ( const string & )

Sets the separator from its enumerator name.

◆ set_separator_string()

void opennn::Dataset::set_separator_string ( const string & )

Sets the separator from its literal character.

◆ set_shape()

void opennn::Dataset::set_shape	(	const string &	,
		const Shape &	)

Sets the tensor Shape associated with the given role ("Input"/"Target"/"Decoder").

◆ set_variable_indices()

void opennn::Dataset::set_variable_indices	(	const vector< Index > &	,
		const vector< Index > &	)

Sets which variable indices are inputs and which are targets.

Parameters

input_indices	Indices for variables to mark as Input.
target_indices	Indices for variables to mark as Target.

◆ set_variable_names()

void opennn::Dataset::set_variable_names ( const vector< string > & )

Assigns names to all variables from the given list.

◆ set_variable_role() [1/2]

void opennn::Dataset::set_variable_role	(	const Index	,
		const string &	)

Sets the role of the variable at the given index.

◆ set_variable_role() [2/2]

void opennn::Dataset::set_variable_role	(	const string &	,
		const string &	)

Sets the role of the variable with the given name.

◆ set_variable_roles() [1/2]

void opennn::Dataset::set_variable_roles ( const string & )

Assigns the same role to every variable.

◆ set_variable_roles() [2/2]

void opennn::Dataset::set_variable_roles ( const vector< string > & )

Sets the role of every variable from the given string list.

◆ set_variable_type() [1/2]

void opennn::Dataset::set_variable_type	(	const Index	,
		const VariableType &	)

Sets the type of the variable at the given index.

◆ set_variable_type() [2/2]

void opennn::Dataset::set_variable_type	(	const string &	,
		const VariableType &	)

Sets the type of the variable with the given name.

◆ set_variable_types()

void opennn::Dataset::set_variable_types ( const VariableType & )

Sets every variable to the given type.

◆ set_variables()

void opennn::Dataset::set_variables ( const vector< Variable > & new_variables )

inline

◆ set_variables_number()

void opennn::Dataset::set_variables_number ( const Index new_size )

inline

◆ split_samples() [1/2]

void opennn::Dataset::split_samples	(	const float	training_ratio = 0.6f,
		float	selection_ratio = 0.2f,
		float	testing_ratio = 0.2f,
		bool	shuffle = true )

Splits samples into Training/Validation/Testing roles, optionally shuffled.

Parameters

training_ratio	Fraction of samples assigned to Training.
selection_ratio	Fraction of samples assigned to Validation.
testing_ratio	Fraction of samples assigned to Testing.
shuffle	If true, shuffles samples before splitting.

◆ split_samples() [2/2]

vector< vector< Index > > opennn::Dataset::split_samples	(	const vector< Index > &	,
		Index	) const

nodiscard

Splits the given indices into chunks of the requested size.

◆ split_samples_random()

void opennn::Dataset::split_samples_random	(	const float	training_ratio = 0.6f,
		float	selection_ratio = 0.2f,
		float	testing_ratio = 0.2f )

Splits samples randomly across roles.

◆ split_samples_sequential()

void opennn::Dataset::split_samples_sequential	(	const float	training_ratio = 0.6f,
		float	selection_ratio = 0.2f,
		float	testing_ratio = 0.2f )

Splits samples sequentially without shuffling.

◆ to_JSON()

virtual void opennn::Dataset::to_JSON ( JsonWriter & ) const

inline virtual

Writes dataset state to a JSON writer.

Reimplemented in opennn::ImageDataset, opennn::LanguageDataset, opennn::TabularDataset, and opennn::TimeSeriesDataset.

◆ unscale_features()

virtual void opennn::Dataset::unscale_features	(	const string &	,
		const vector< Descriptives > &	)

inline virtual

Reverts a previously applied scaling using the supplied descriptives.

Reimplemented in opennn::ImageDataset and opennn::TabularDataset.

◆ variables_from_JSON()

void opennn::Dataset::variables_from_JSON ( const Json * )

protected

◆ variables_to_JSON()

void opennn::Dataset::variables_to_JSON ( JsonWriter & ) const

protected

Member Data Documentation

◆ codification

Codification opennn::Dataset::codification = Codification::UTF8

protected

◆ data

MatrixR opennn::Dataset::data

protected

◆ data_file_preview

vector<vector<string> > opennn::Dataset::data_file_preview

protected

◆ data_path

filesystem::path opennn::Dataset::data_path

protected

◆ decoder_shape

Shape opennn::Dataset::decoder_shape

protected

◆ display

bool opennn::Dataset::display = true

protected

◆ has_header

bool opennn::Dataset::has_header = false

protected

◆ has_sample_ids

bool opennn::Dataset::has_sample_ids = false

protected

◆ input_shape

Shape opennn::Dataset::input_shape

protected

◆ negative_words

const vector<string> opennn::Dataset::negative_words = {"0", "no", "negative", "-", "false", "bad", "not", "No"}

protected

◆ positive_words

const vector<string> opennn::Dataset::positive_words = {"1", "yes", "positive", "+", "true", "good", "si", "sí", "Sí"}

protected

◆ sample_ids

vector<string> opennn::Dataset::sample_ids

protected

◆ sample_roles

vector<SampleRole> opennn::Dataset::sample_roles

protected

◆ separator

Separator opennn::Dataset::separator = Separator::Comma

protected

◆ target_shape

Shape opennn::Dataset::target_shape

protected

◆ variables

vector<Variable> opennn::Dataset::variables

protected

Public Types

Public Member Functions

Protected Member Functions

Protected Attributes

Detailed Description

Member Enumeration Documentation

◆ Codification

◆ Separator

Constructor & Destructor Documentation

◆ ~Dataset()

◆ Dataset()

Member Function Documentation

◆ augment_inputs()

◆ calculate_correlations_rank()

◆ calculate_feature_descriptives()

◆ calculate_input_target_variable_pearson_correlations()

◆ calculate_target_distribution()

◆ check_separators()

◆ count_nan()

◆ count_nans_per_variable()

◆ count_rows_with_nan()

◆ count_variables_with_nan()

◆ fill_decoder()

◆ fill_inputs()

◆ fill_targets()

◆ from_JSON()

◆ get_batches()

◆ get_codification()

◆ get_codification_string()

◆ get_data() [1/2]

◆ get_data() [2/2]

◆ get_data_file_preview()

◆ get_data_from_indices()

◆ get_data_path()

◆ get_display()

◆ get_feature_data()

◆ get_feature_dimensions()

◆ get_feature_indices() [1/3]

◆ get_feature_indices() [2/3]

◆ get_feature_indices() [3/3]

◆ get_feature_names() [1/2]

◆ get_feature_names() [2/2]

◆ get_features_number() [1/2]

◆ get_features_number() [2/2]

◆ get_input_shape()

◆ get_sample_data()

◆ get_sample_indices()

◆ get_sample_role_numbers()

◆ get_sample_roles()

◆ get_sample_roles_vector()

◆ get_samples_number() [1/2]

◆ get_samples_number() [2/2]

◆ get_separator()

◆ get_separator_name()

◆ get_separator_string()

◆ get_shape()

◆ get_target_shape()

◆ get_used_feature_indices()

◆ get_used_features_number()

◆ get_used_sample_indices()

◆ get_used_samples_number()

◆ get_used_variables_indices()

◆ get_used_variables_number()

◆ get_variable_data() [1/3]

◆ get_variable_data() [2/3]

◆ get_variable_data() [3/3]

◆ get_variable_index() [1/2]

◆ get_variable_index() [2/2]

◆ get_variable_indices()

◆ get_variable_names() [1/2]

◆ get_variable_names() [2/2]

◆ get_variable_type()

◆ get_variable_types()

◆ get_variables() [1/2]

◆ get_variables() [2/2]

◆ get_variables_number() [1/2]

◆ get_variables_number() [2/2]

◆ has_binary_or_categorical_variables()

◆ has_binary_variables()

◆ has_categorical_variables()