Token-based language dataset with input/target vocabularies and binary token cache. More...

#include <language_dataset.h>

Inheritance diagram for opennn::LanguageDataset:

Public Member Functions
	LanguageDataset (const filesystem::path &="")
	Creates a language dataset, optionally reading documents from the given path.

	LanguageDataset (const filesystem::path &, Index maximum_vocabulary_size)
	Creates a language dataset capped at the given vocabulary size.

	LanguageDataset (const filesystem::path &, Index maximum_vocabulary_size, Index minimum_token_frequency)
	Creates a language dataset with vocabulary size and minimum token frequency filters.

	LanguageDataset (const Index, Index, Index)
	Creates a synthetic language dataset with the given sample count and vocabulary controls.

const vector< string > &	get_input_vocabulary () const

const vector< string > &	get_target_vocabulary () const

Index	get_input_vocabulary_size () const

Index	get_target_vocabulary_size () const

const unordered_map< string, Index > &	get_input_vocabulary_map () const

const unordered_map< string, Index > &	get_target_vocabulary_map () const

const unordered_map< Index, string > &	get_target_inverse_vocabulary_map () const

Index	get_maximum_input_sequence_length () const

Index	get_maximum_target_sequence_length () const

void	set_input_vocabulary (const vector< string > &)
	Replaces the input vocabulary and rebuilds its lookup map.

void	set_target_vocabulary (const vector< string > &)
	Replaces the target vocabulary and rebuilds its lookup maps.

void	set_maximum_vocabulary_size (Index new_maximum)

void	set_minimum_token_frequency (Index new_minimum)

Index	get_samples_number () const override
	Returns the number of token sequences in the dataset.

VectorI	calculate_target_distribution () const override
	Returns the distribution of target tokens or classes.

void	read_txt ()
	Reads the configured text corpus into the dataset and builds vocabularies.

void	create_vocabulary (const vector< vector< string_view > > &, vector< string > &) const
	Builds a vocabulary from the tokenized documents, filtered by frequency and size limits.

void	from_JSON (const JsonDocument &) override
	Loads dataset state from a JSON document.

void	to_JSON (JsonWriter &) const override
	Writes dataset state to a JSON writer.

void	fill_inputs (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
	Streams input token indices of the selected samples into the destination buffer.

void	fill_targets (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
	Streams target token indices of the selected samples into the destination buffer.

void	fill_decoder (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
	Streams decoder token indices (target shifted right) into the destination buffer.

Index	get_samples_number (const string &) const
	Returns the number of samples with the given role ("Training", "Validation", ...).

Public Member Functions inherited from opennn::Dataset
virtual	~Dataset ()=default

Index	get_samples_number (const string &) const
	Returns the number of samples with the given role ("Training", "Validation", ...).

Index	get_used_samples_number () const
	Returns the number of samples whose role is not None.

vector< Index >	get_sample_indices (const string &) const
	Returns indices of samples with the given role name.

vector< Index >	get_used_sample_indices () const
	Returns indices of all samples whose role is not None.

const vector< SampleRole > &	get_sample_roles () const
	Returns the per-sample role vector.

vector< Index >	get_sample_roles_vector () const
	Returns the per-sample roles as integer indices.

VectorI	get_sample_role_numbers () const
	Returns the per-sample roles as a tensor of integer indices.

Index	get_variables_number () const
	Returns the total number of variables (column descriptors).

Index	get_variables_number (const string &) const
	Returns the number of variables with the given role name.

Index	get_used_variables_number () const
	Returns the number of variables whose role is in use.

const vector< Variable > &	get_variables () const
	Returns the variable descriptors.

vector< Variable >	get_variables (const string &) const
	Returns the variables with the given role name.

Index	get_variable_index (const string &) const
	Returns the index of the variable with the given name.

Index	get_variable_index (const Index) const
	Returns the variable index corresponding to a flat feature index.

vector< Index >	get_variable_indices (const string &) const
	Returns the indices of variables matching the given role name.

vector< Index >	get_used_variables_indices () const
	Returns the indices of all in-use variables.

vector< string >	get_variable_names () const
	Returns the names of all variables.

vector< string >	get_variable_names (const string &) const
	Returns the names of variables with the given role name.

VariableType	get_variable_type (const Index index) const
	Returns the VariableType of the variable at the given index.

vector< VariableType >	get_variable_types (const vector< Index > &indices) const
	Returns the VariableType for each of the given variable indices.

Index	get_features_number () const
	Returns the total number of features (data matrix columns).

Index	get_features_number (const string &) const
	Returns the number of features for variables with the given role name.

Index	get_used_features_number () const
	Returns the number of features for in-use variables.

vector< string >	get_feature_names () const
	Returns the expanded feature names (one entry per data matrix column).

vector< string >	get_feature_names (const string &) const
	Returns the expanded feature names restricted to the given role.

vector< vector< Index > >	get_feature_indices () const
	Returns the data column indices grouped per variable.

vector< Index >	get_feature_indices (const Index) const
	Returns the data column indices that belong to the given variable index.

vector< Index >	get_feature_indices (const string &) const
	Returns the data column indices that belong to variables with the given role name.

vector< Index >	get_used_feature_indices () const
	Returns the data column indices that belong to in-use variables.

vector< Index >	get_feature_dimensions () const
	Returns the per-variable feature dimension counts.

Shape	get_shape (const string &) const
	Returns the configured Shape for the given role ("Input", "Target", "Decoder").

virtual void	get_batches (const vector< Index > &, Index, bool, vector< vector< Index > > &) const
	Splits sample indices into batches and writes them into `batches`.

const vector< vector< string > > &	get_data_file_preview () const
	Returns the parsed preview rows captured during the last file read.

const filesystem::path &	get_data_path () const
	Returns the configured data file path.

const Separator &	get_separator () const
	Returns the current field Separator.

string	get_separator_string () const
	Returns the separator as the literal character used in files.

string	get_separator_name () const
	Returns the separator as its enumerator name ("Space", "Tab", ...).

const Codification &	get_codification () const
	Returns the configured text Codification.

string	get_codification_string () const
	Returns the codification as its enumerator name.

bool	get_display () const
	Returns whether progress messages are printed.

virtual bool	is_empty () const
	Returns true when the dataset contains no samples.

Shape	get_input_shape () const
	Returns the configured input tensor shape.

Shape	get_target_shape () const
	Returns the configured target tensor shape.

const MatrixR &	get_data () const
	Returns the raw data matrix (rows = samples, columns = features).

void	set_data (const MatrixR &)
	Replaces the underlying data matrix.

void	set_data_constant (const float)
	Fills the data matrix with the given constant value.

void	set_default ()
	Restores default dataset settings.

void	set_sample_roles (const string &)
	Assigns the same role to every sample.

void	set_sample_role (const Index, const string &)
	Sets the role of a single sample by index.

void	set_sample_roles (const vector< string > &)
	Assigns a role per sample from a string vector.

void	set_sample_roles (const vector< Index > &, const string &)
	Assigns the same role to the given sample indices.

void	set_variables (const vector< Variable > &new_variables)

void	set_default_variable_names ()
	Assigns default placeholder names to all variables.

void	set_variable_roles (const vector< string > &)
	Sets the role of every variable from the given string list.

void	set_variable_indices (const vector< Index > &, const vector< Index > &)
	Sets which variable indices are inputs and which are targets.

void	set_input_variables_unused ()
	Marks all input variables as unused (role None).

void	set_variable_role (const Index, const string &)
	Sets the role of the variable at the given index.

void	set_variable_role (const string &, const string &)
	Sets the role of the variable with the given name.

void	set_variable_type (const Index, const VariableType &)
	Sets the type of the variable at the given index.

void	set_variable_type (const string &, const VariableType &)
	Sets the type of the variable with the given name.

void	set_variable_types (const VariableType &)
	Sets every variable to the given type.

void	set_binary_variables ()
	Detects and marks variables with binary values as VariableType::Binary.

void	set_variable_names (const vector< string > &)
	Assigns names to all variables from the given list.

void	set_variables_number (const Index new_size)

void	set_feature_names (const vector< string > &)
	Assigns expanded feature names, propagating categories back to variables.

void	set_variable_roles (const string &)
	Assigns the same role to every variable.

void	set_shape (const string &, const Shape &)
	Sets the tensor Shape associated with the given role ("Input", "Target", "Decoder").

virtual void	resize_input_shape (Index input_features_count)
	Resizes the input shape to the given flat feature count.

void	set_data_path (const filesystem::path &new_data_path)

void	set_has_header (bool new_has_header)

void	set_has_ids (bool new_has_ids)

void	set_separator (const Separator &new_separator)

void	set_separator_string (const string &)
	Sets the separator from its literal character.

void	set_separator_name (const string &)
	Sets the separator from its enumerator name.

void	set_codification (const Codification &new_codification)

void	set_codification (const string &)
	Sets the codification from its enumerator name.

void	set_display (bool new_display)

bool	is_sample_used (const Index i) const
	Returns true if the sample at `i` has a role other than None.

bool	has_binary_variables () const
	Returns true if any variable has type Binary.

bool	has_categorical_variables () const
	Returns true if any variable has type Categorical.

bool	has_binary_or_categorical_variables () const
	Returns true if any variable is Binary or Categorical.

bool	has_time_variable () const
	Returns true if any variable has role Time.

bool	has_validation () const
	Returns true if any sample is assigned the Validation role.

void	split_samples (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true)
	Splits samples into Training/Validation/Testing roles, optionally shuffled.

void	split_samples_sequential (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
	Splits samples sequentially without shuffling.

void	split_samples_random (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
	Splits samples randomly across roles.

vector< vector< Index > >	split_samples (const vector< Index > &, Index) const
	Splits the given indices into chunks of the requested size.

virtual vector< Descriptives >	scale_features (const string &)
	Scales the features with the configured method; returns the applied descriptives.

virtual void	set_data_random ()
	Fills the data matrix with random values (no-op in base class).

virtual void	set_data_integer (const Index)
	Fills the data matrix with random integers up to the given vocabulary size.

MatrixR	get_data (const string &, const string &) const
	Returns the data submatrix for the given sample role and feature role.

MatrixR	get_data_from_indices (const vector< Index > &, const vector< Index > &) const
	Returns a submatrix from the data using explicit row and column indices.

VectorR	get_sample_data (const Index) const
	Returns the row of the data matrix at the given sample index.

MatrixR	get_variable_data (const Index) const
	Returns the data columns belonging to the variable at `index`.

MatrixR	get_variable_data (const Index, const vector< Index > &) const
	Returns the variable data restricted to the given sample indices.

MatrixR	get_variable_data (const string &) const
	Returns the data columns of the variable with the given name.

MatrixR	get_feature_data (const string &) const
	Returns the data columns belonging to features with the given role name.

void	set (const Index=0, const Shape &={}, const Shape &={})
	Resets the dataset with the given sample count and input/target shapes.

bool	has_nan () const
	Returns true if any entry in the data matrix is NaN.

bool	has_nan_row (const Index) const
	Returns true if the sample at the given row contains a NaN.

VectorI	count_nans_per_variable () const
	Returns the NaN count for each variable.

Index	count_variables_with_nan () const
	Returns the number of variables that contain at least one NaN.

Index	count_rows_with_nan () const
	Returns the number of rows that contain at least one NaN.

Index	count_nan () const
	Returns the total NaN count in the data matrix.

virtual void	scrub_missing_values ()
	Removes or imputes missing values using the configured strategy.

virtual vector< Descriptives >	calculate_feature_descriptives (const string &) const
	Returns descriptive statistics for features with the given role (subclass-specific).

virtual Tensor< Correlation, 2 >	calculate_input_target_variable_pearson_correlations () const
	Returns the Pearson correlations between input and target variables.

virtual VectorI	calculate_correlations_rank () const
	Returns input variables ranked by absolute correlation with the target.

virtual void	unscale_features (const string &, const vector< Descriptives > &)
	Reverts a previously applied scaling using the supplied descriptives.

void	save (const filesystem::path &) const
	Saves the dataset metadata to a JSON file at the given path.

void	load (const filesystem::path &)
	Loads the dataset metadata from the JSON file at the given path.

void	save_data () const
	Saves the data matrix to the configured data path.

void	save_data_binary (const filesystem::path &) const
	Saves the data matrix to the given path in binary form.

void	load_data_binary ()
	Loads the data matrix from the binary file at the configured data path.

virtual void	augment_inputs (float *, Index) const
	Applies data augmentation in place to the input buffer (no-op in base class).

Static Public Attributes
static const string	PAD_TOKEN = "[PAD]"

static const string	UNK_TOKEN = "[UNK]"

static const string	START_TOKEN = "[START]"

static const string	END_TOKEN = "[END]"

static const float	UNK_INDEX = 1.0f

static const float	START_INDEX = 2.0f

static const float	END_INDEX = 3.0f

static const vector< string >	reserved_tokens = {PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN}

Additional Inherited Members
Public Types inherited from opennn::Dataset
enum class	Codification { UTF8 , SHIFT_JIS }
	Text encoding of the source data file. More...

enum class	Separator { Space , Tab , Comma , Semicolon }
	Field separator used when reading delimited text files. More...

Protected Member Functions inherited from opennn::Dataset
	Dataset ()=default

void	set_default_variable_roles ()

void	set_default_variable_roles_forecasting ()

void	infer_variable_types_from_data ()

void	read_data_file_preview (const vector< vector< string_view > > &)

void	check_separators (string_view) const

void	samples_from_JSON (const Json *)

void	variables_to_JSON (JsonWriter &) const

void	samples_to_JSON (JsonWriter &) const

void	preview_data_to_JSON (JsonWriter &) const

void	variables_from_JSON (const Json *)

void	preview_data_from_JSON (const Json *)

Protected Attributes inherited from opennn::Dataset
Shape	input_shape

Shape	target_shape

Shape	decoder_shape

vector< SampleRole >	sample_roles

vector< string >	sample_ids

vector< Variable >	variables

MatrixR	data

filesystem::path	data_path

Separator	separator = Separator::Comma

bool	has_header = false

bool	has_sample_ids = false

Codification	codification = Codification::UTF8

vector< vector< string > >	data_file_preview

bool	display = true

const vector< string >	positive_words = {"1", "yes", "positive", "+", "true", "good", "si", "sí", "Sí"}

const vector< string >	negative_words = {"0", "no", "negative", "-", "false", "bad", "not", "No"}

Detailed Description

Token-based language dataset with input/target vocabularies and binary token cache.

Constructor & Destructor Documentation

◆ LanguageDataset() [1/4]

opennn::LanguageDataset::LanguageDataset ( const filesystem::path & = "" )

Creates a language dataset, optionally reading documents from the given path.

◆ LanguageDataset() [2/4]

opennn::LanguageDataset::LanguageDataset	(	const filesystem::path &	,
		Index	maximum_vocabulary_size )

Creates a language dataset capped at the given vocabulary size.

◆ LanguageDataset() [3/4]

opennn::LanguageDataset::LanguageDataset	(	const filesystem::path &	,
		Index	maximum_vocabulary_size,
		Index	minimum_token_frequency )

Creates a language dataset with vocabulary size and minimum token frequency filters.

◆ LanguageDataset() [4/4]

opennn::LanguageDataset::LanguageDataset	(	const Index	,
		Index	,
		Index	)

Creates a synthetic language dataset with the given sample count and vocabulary controls.

Member Function Documentation

◆ calculate_target_distribution()

VectorI opennn::LanguageDataset::calculate_target_distribution ( ) const

overridevirtual

Returns the distribution of target tokens or classes.

Reimplemented from opennn::Dataset.

◆ create_vocabulary()

void opennn::LanguageDataset::create_vocabulary	(	const vector< vector< string_view > > &	,
		vector< string > &	) const

Builds a vocabulary from the tokenized documents, filtered by frequency and size limits.

Parameters

documents	Tokenized input or target documents.
vocabulary	Output vector receiving the resulting vocabulary.

◆ fill_decoder()

void opennn::LanguageDataset::fill_decoder	(	const vector< Index > &	,
		const vector< Index > &	,
		float *	,
		bool	is_training,
		bool	parallelize = true,
		int	= -1 ) const

overridevirtual

Streams decoder token indices (target shifted right) into the destination buffer.

Reimplemented from opennn::Dataset.

◆ fill_inputs()

void opennn::LanguageDataset::fill_inputs	(	const vector< Index > &	,
		const vector< Index > &	,
		float *	,
		bool	is_training,
		bool	parallelize = true,
		int	= -1 ) const

overridevirtual

Streams input token indices of the selected samples into the destination buffer.

Reimplemented from opennn::Dataset.

◆ fill_targets()

void opennn::LanguageDataset::fill_targets	(	const vector< Index > &	,
		const vector< Index > &	,
		float *	,
		bool	is_training,
		bool	parallelize = true,
		int	= -1 ) const

overridevirtual

Streams target token indices of the selected samples into the destination buffer.

Reimplemented from opennn::Dataset.

◆ from_JSON()

void opennn::LanguageDataset::from_JSON ( const JsonDocument & )

overridevirtual

Loads dataset state from a JSON document.

Implements opennn::Dataset.

◆ get_input_vocabulary()

const vector< string > & opennn::LanguageDataset::get_input_vocabulary ( ) const

inline

◆ get_input_vocabulary_map()

const unordered_map< string, Index > & opennn::LanguageDataset::get_input_vocabulary_map ( ) const

inline

◆ get_input_vocabulary_size()

Index opennn::LanguageDataset::get_input_vocabulary_size ( ) const

inline

◆ get_maximum_input_sequence_length()

Index opennn::LanguageDataset::get_maximum_input_sequence_length ( ) const

inline

◆ get_maximum_target_sequence_length()

Index opennn::LanguageDataset::get_maximum_target_sequence_length ( ) const

inline

◆ get_samples_number() [1/2]

Index opennn::LanguageDataset::get_samples_number ( ) const

overridevirtual

Returns the number of token sequences in the dataset.

Reimplemented from opennn::Dataset.

◆ get_samples_number() [2/2]

Index opennn::Dataset::get_samples_number ( const string & ) const

nodiscard

Returns the number of samples with the given role ("Training", "Validation", ...).

◆ get_target_inverse_vocabulary_map()

const unordered_map< Index, string > & opennn::LanguageDataset::get_target_inverse_vocabulary_map ( ) const

inline

◆ get_target_vocabulary()

const vector< string > & opennn::LanguageDataset::get_target_vocabulary ( ) const

inline

◆ get_target_vocabulary_map()

const unordered_map< string, Index > & opennn::LanguageDataset::get_target_vocabulary_map ( ) const

inline

◆ get_target_vocabulary_size()

Index opennn::LanguageDataset::get_target_vocabulary_size ( ) const

inline

◆ read_txt()

void opennn::LanguageDataset::read_txt ( )

Reads the configured text corpus into the dataset and builds vocabularies.

◆ set_input_vocabulary()

void opennn::LanguageDataset::set_input_vocabulary ( const vector< string > & )

Replaces the input vocabulary and rebuilds its lookup map.

◆ set_maximum_vocabulary_size()

void opennn::LanguageDataset::set_maximum_vocabulary_size ( Index new_maximum )

inline

◆ set_minimum_token_frequency()

void opennn::LanguageDataset::set_minimum_token_frequency ( Index new_minimum )

inline

◆ set_target_vocabulary()

void opennn::LanguageDataset::set_target_vocabulary ( const vector< string > & )

Replaces the target vocabulary and rebuilds its lookup maps.

◆ to_JSON()

void opennn::LanguageDataset::to_JSON ( JsonWriter & ) const

overridevirtual

Writes dataset state to a JSON writer.

Reimplemented from opennn::Dataset.

Member Data Documentation

◆ END_INDEX

const float opennn::LanguageDataset::END_INDEX = 3.0f

inlinestatic

◆ END_TOKEN

const string opennn::LanguageDataset::END_TOKEN = "[END]"

inlinestatic

◆ PAD_TOKEN

const string opennn::LanguageDataset::PAD_TOKEN = "[PAD]"

inlinestatic

◆ reserved_tokens

const vector<string> opennn::LanguageDataset::reserved_tokens = {PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN}

inlinestatic

◆ START_INDEX

const float opennn::LanguageDataset::START_INDEX = 2.0f

inlinestatic

◆ START_TOKEN

const string opennn::LanguageDataset::START_TOKEN = "[START]"

inlinestatic

◆ UNK_INDEX

const float opennn::LanguageDataset::UNK_INDEX = 1.0f

inlinestatic

◆ UNK_TOKEN

const string opennn::LanguageDataset::UNK_TOKEN = "[UNK]"

inlinestatic

Public Member Functions

Static Public Attributes

Additional Inherited Members

Detailed Description

Constructor & Destructor Documentation

◆ LanguageDataset() [1/4]

◆ LanguageDataset() [2/4]

◆ LanguageDataset() [3/4]

◆ LanguageDataset() [4/4]

Member Function Documentation

◆ calculate_target_distribution()

◆ create_vocabulary()

◆ fill_decoder()

◆ fill_inputs()

◆ fill_targets()

◆ from_JSON()

◆ get_input_vocabulary()

◆ get_input_vocabulary_map()

◆ get_input_vocabulary_size()

◆ get_maximum_input_sequence_length()

◆ get_maximum_target_sequence_length()

◆ get_samples_number() [1/2]

◆ get_samples_number() [2/2]

◆ get_target_inverse_vocabulary_map()

◆ get_target_vocabulary()

◆ get_target_vocabulary_map()

◆ get_target_vocabulary_size()

◆ read_txt()

◆ set_input_vocabulary()

◆ set_maximum_vocabulary_size()

◆ set_minimum_token_frequency()

◆ set_target_vocabulary()

◆ to_JSON()

Member Data Documentation

◆ END_INDEX

◆ END_TOKEN

◆ PAD_TOKEN

◆ reserved_tokens

◆ START_INDEX

◆ START_TOKEN

◆ UNK_INDEX

◆ UNK_TOKEN