OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
opennn::LanguageDataset Class Reference final

Token-based language dataset with input/target vocabularies and binary token cache. More...

#include <language_dataset.h>

Inheritance diagram for opennn::LanguageDataset:
[legend]

Public Member Functions

 LanguageDataset (const filesystem::path &="")
 Creates a language dataset, optionally reading documents from the given path.
 
 LanguageDataset (const filesystem::path &, Index maximum_vocabulary_size)
 Creates a language dataset capped at the given vocabulary size.
 
 LanguageDataset (const filesystem::path &, Index maximum_vocabulary_size, Index minimum_token_frequency)
 Creates a language dataset with vocabulary size and minimum token frequency filters.
 
 LanguageDataset (const Index, Index, Index)
 Creates a synthetic language dataset with the given sample count and vocabulary controls.
 
const vector< string > & get_input_vocabulary () const
 
const vector< string > & get_target_vocabulary () const
 
Index get_input_vocabulary_size () const
 
Index get_target_vocabulary_size () const
 
const unordered_map< string, Index > & get_input_vocabulary_map () const
 
const unordered_map< string, Index > & get_target_vocabulary_map () const
 
const unordered_map< Index, string > & get_target_inverse_vocabulary_map () const
 
Index get_maximum_input_sequence_length () const
 
Index get_maximum_target_sequence_length () const
 
void set_input_vocabulary (const vector< string > &)
 Replaces the input vocabulary and rebuilds its lookup map.
 
void set_target_vocabulary (const vector< string > &)
 Replaces the target vocabulary and rebuilds its lookup maps.
 
void set_maximum_vocabulary_size (Index new_maximum)
 
void set_minimum_token_frequency (Index new_minimum)
 
Index get_samples_number () const override
 Returns the number of token sequences in the dataset.
 
VectorI calculate_target_distribution () const override
 Returns the distribution of target tokens or classes.
 
void read_txt ()
 Reads the configured text corpus into the dataset and builds vocabularies.
 
void create_vocabulary (const vector< vector< string_view > > &, vector< string > &) const
 Builds a vocabulary from the tokenized documents, filtered by frequency and size limits.
 
void from_JSON (const JsonDocument &) override
 Loads dataset state from a JSON document.
 
void to_JSON (JsonWriter &) const override
 Writes dataset state to a JSON writer.
 
void fill_inputs (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
 Streams input token indices of the selected samples into the destination buffer.
 
void fill_targets (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
 Streams target token indices of the selected samples into the destination buffer.
 
void fill_decoder (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
 Streams decoder token indices (target shifted right) into the destination buffer.
 
Index get_samples_number (const string &) const
 Returns the number of samples with the given role ("Training", "Validation", ...).
 
- Public Member Functions inherited from opennn::Dataset
virtual ~Dataset ()=default
 
Index get_samples_number (const string &) const
 Returns the number of samples with the given role ("Training", "Validation", ...).
 
Index get_used_samples_number () const
 Returns the number of samples whose role is not None.
 
vector< Index > get_sample_indices (const string &) const
 Returns indices of samples with the given role name.
 
vector< Index > get_used_sample_indices () const
 Returns indices of all samples whose role is not None.
 
const vector< SampleRole > & get_sample_roles () const
 Returns the per-sample role vector.
 
vector< Index > get_sample_roles_vector () const
 Returns the per-sample roles as integer indices.
 
VectorI get_sample_role_numbers () const
 Returns the per-sample roles as a tensor of integer indices.
 
Index get_variables_number () const
 Returns the total number of variables (column descriptors).
 
Index get_variables_number (const string &) const
 Returns the number of variables with the given role name.
 
Index get_used_variables_number () const
 Returns the number of variables whose role is in use.
 
const vector< Variable > & get_variables () const
 Returns the variable descriptors.
 
vector< Variable > get_variables (const string &) const
 Returns the variables with the given role name.
 
Index get_variable_index (const string &) const
 Returns the index of the variable with the given name.
 
Index get_variable_index (const Index) const
 Returns the variable index corresponding to a flat feature index.
 
vector< Index > get_variable_indices (const string &) const
 Returns the indices of variables matching the given role name.
 
vector< Index > get_used_variables_indices () const
 Returns the indices of all in-use variables.
 
vector< string > get_variable_names () const
 Returns the names of all variables.
 
vector< string > get_variable_names (const string &) const
 Returns the names of variables with the given role name.
 
VariableType get_variable_type (const Index index) const
 Returns the VariableType of the variable at the given index.
 
vector< VariableType > get_variable_types (const vector< Index > &indices) const
 Returns the VariableType for each of the given variable indices.
 
Index get_features_number () const
 Returns the total number of features (data matrix columns).
 
Index get_features_number (const string &) const
 Returns the number of features for variables with the given role name.
 
Index get_used_features_number () const
 Returns the number of features for in-use variables.
 
vector< string > get_feature_names () const
 Returns the expanded feature names (one entry per data matrix column).
 
vector< string > get_feature_names (const string &) const
 Returns the expanded feature names restricted to the given role.
 
vector< vector< Index > > get_feature_indices () const
 Returns the data column indices grouped per variable.
 
vector< Index > get_feature_indices (const Index) const
 Returns the data column indices that belong to the given variable index.
 
vector< Index > get_feature_indices (const string &) const
 Returns the data column indices that belong to variables with the given role name.
 
vector< Index > get_used_feature_indices () const
 Returns the data column indices that belong to in-use variables.
 
vector< Index > get_feature_dimensions () const
 Returns the per-variable feature dimension counts.
 
Shape get_shape (const string &) const
 Returns the configured Shape for the given role ("Input", "Target", "Decoder").
 
virtual void get_batches (const vector< Index > &, Index, bool, vector< vector< Index > > &) const
 Splits the given sample indices into batches and writes the result into the output vector of batches.
 
const vector< vector< string > > & get_data_file_preview () const
 Returns the parsed preview rows captured during the last file read.
 
const filesystem::path & get_data_path () const
 Returns the configured data file path.
 
const Separator & get_separator () const
 Returns the current field Separator.
 
string get_separator_string () const
 Returns the separator as the literal character used in files.
 
string get_separator_name () const
 Returns the separator as its enumerator name ("Space", "Tab", ...).
 
const Codification & get_codification () const
 Returns the configured text Codification.
 
string get_codification_string () const
 Returns the codification as its enumerator name.
 
bool get_display () const
 Returns whether progress messages are printed.
 
virtual bool is_empty () const
 Returns true when the dataset contains no samples.
 
Shape get_input_shape () const
 Returns the configured input tensor shape.
 
Shape get_target_shape () const
 Returns the configured target tensor shape.
 
const MatrixR & get_data () const
 Returns the raw data matrix (rows = samples, columns = features).
 
void set_data (const MatrixR &)
 Replaces the underlying data matrix.
 
void set_data_constant (const float)
 Fills the data matrix with the given constant value.
 
void set_default ()
 Restores default dataset settings.
 
void set_sample_roles (const string &)
 Assigns the same role to every sample.
 
void set_sample_role (const Index, const string &)
 Sets the role of a single sample by index.
 
void set_sample_roles (const vector< string > &)
 Assigns a role per sample from a string vector.
 
void set_sample_roles (const vector< Index > &, const string &)
 Assigns the same role to the given sample indices.
 
void set_variables (const vector< Variable > &new_variables)
 
void set_default_variable_names ()
 Assigns default placeholder names to all variables.
 
void set_variable_roles (const vector< string > &)
 Sets the role of every variable from the given string list.
 
void set_variable_indices (const vector< Index > &, const vector< Index > &)
 Sets which variable indices are inputs and which are targets.
 
void set_input_variables_unused ()
 Marks all input variables as unused (role None).
 
void set_variable_role (const Index, const string &)
 Sets the role of the variable at the given index.
 
void set_variable_role (const string &, const string &)
 Sets the role of the variable with the given name.
 
void set_variable_type (const Index, const VariableType &)
 Sets the type of the variable at the given index.
 
void set_variable_type (const string &, const VariableType &)
 Sets the type of the variable with the given name.
 
void set_variable_types (const VariableType &)
 Sets every variable to the given type.
 
void set_binary_variables ()
 Detects and marks variables with binary values as VariableType::Binary.
 
void set_variable_names (const vector< string > &)
 Assigns names to all variables from the given list.
 
void set_variables_number (const Index new_size)
 
void set_feature_names (const vector< string > &)
 Assigns expanded feature names, propagating categories back to variables.
 
void set_variable_roles (const string &)
 Assigns the same role to every variable.
 
void set_shape (const string &, const Shape &)
 Sets the tensor Shape associated with the given role ("Input"/"Target"/"Decoder").
 
virtual void resize_input_shape (Index input_features_count)
 Resizes the input shape to the given flat feature count.
 
void set_data_path (const filesystem::path &new_data_path)
 
void set_has_header (bool new_has_header)
 
void set_has_ids (bool new_has_ids)
 
void set_separator (const Separator &new_separator)
 
void set_separator_string (const string &)
 Sets the separator from its literal character.
 
void set_separator_name (const string &)
 Sets the separator from its enumerator name.
 
void set_codification (const Codification &new_codification)
 
void set_codification (const string &)
 Sets the codification from its enumerator name.
 
void set_display (bool new_display)
 
bool is_sample_used (const Index i) const
 Returns true if the sample at i has a role other than None.
 
bool has_binary_variables () const
 Returns true if any variable has type Binary.
 
bool has_categorical_variables () const
 Returns true if any variable has type Categorical.
 
bool has_binary_or_categorical_variables () const
 Returns true if any variable is Binary or Categorical.
 
bool has_time_variable () const
 Returns true if any variable has role Time.
 
bool has_validation () const
 Returns true if any sample is assigned the Validation role.
 
void split_samples (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true)
 Splits samples into Training/Validation/Testing roles, optionally shuffled.
 
void split_samples_sequential (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
 Splits samples sequentially without shuffling.
 
void split_samples_random (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
 Splits samples randomly across roles.
 
vector< vector< Index > > split_samples (const vector< Index > &, Index) const
 Splits the given indices into chunks of the requested size.
 
virtual vector< Descriptives > scale_features (const string &)
 Scales the features with the configured method; returns the applied descriptives.
 
virtual void set_data_random ()
 Fills the data matrix with random values (no-op in base class).
 
virtual void set_data_integer (const Index)
 Fills the data matrix with random integers up to the given vocabulary size.
 
MatrixR get_data (const string &, const string &) const
 Returns the data submatrix for the given sample role and feature role.
 
MatrixR get_data_from_indices (const vector< Index > &, const vector< Index > &) const
 Returns a submatrix from the data using explicit row and column indices.
 
VectorR get_sample_data (const Index) const
 Returns the row of the data matrix at the given sample index.
 
MatrixR get_variable_data (const Index) const
 Returns the data columns belonging to the variable at index.
 
MatrixR get_variable_data (const Index, const vector< Index > &) const
 Returns the variable data restricted to the given sample indices.
 
MatrixR get_variable_data (const string &) const
 Returns the data columns of the variable with the given name.
 
MatrixR get_feature_data (const string &) const
 Returns the data columns belonging to features with the given role name.
 
void set (const Index=0, const Shape &={}, const Shape &={})
 Resets the dataset with the given sample count and input/target shapes.
 
bool has_nan () const
 Returns true if any entry in the data matrix is NaN.
 
bool has_nan_row (const Index) const
 Returns true if the sample at the given row contains a NaN.
 
VectorI count_nans_per_variable () const
 Returns the NaN count for each variable.
 
Index count_variables_with_nan () const
 Returns the number of variables that contain at least one NaN.
 
Index count_rows_with_nan () const
 Returns the number of rows that contain at least one NaN.
 
Index count_nan () const
 Returns the total NaN count in the data matrix.
 
virtual void scrub_missing_values ()
 Removes or imputes missing values using the configured strategy.
 
virtual vector< Descriptives > calculate_feature_descriptives (const string &) const
 Returns descriptive statistics for features with the given role (subclass-specific).
 
virtual Tensor< Correlation, 2 > calculate_input_target_variable_pearson_correlations () const
 Returns the Pearson correlations between input and target variables.
 
virtual VectorI calculate_correlations_rank () const
 Returns input variables ranked by absolute correlation with the target.
 
virtual void unscale_features (const string &, const vector< Descriptives > &)
 Reverts a previously applied scaling using the supplied descriptives.
 
void save (const filesystem::path &) const
 Saves the dataset metadata to a JSON file at the given path.
 
void load (const filesystem::path &)
 Loads the dataset metadata from the JSON file at the given path.
 
void save_data () const
 Saves the data matrix to the configured data path.
 
void save_data_binary (const filesystem::path &) const
 Saves the data matrix to the given path in binary form.
 
void load_data_binary ()
 Loads the data matrix from the binary file at the configured data path.
 
virtual void augment_inputs (float *, Index) const
 Applies data augmentation in place to the input buffer (no-op in base class).
 

Static Public Attributes

static const string PAD_TOKEN = "[PAD]"
 
static const string UNK_TOKEN = "[UNK]"
 
static const string START_TOKEN = "[START]"
 
static const string END_TOKEN = "[END]"
 
static const float UNK_INDEX = 1.0f
 
static const float START_INDEX = 2.0f
 
static const float END_INDEX = 3.0f
 
static const vector< string > reserved_tokens = {PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN}
 

Additional Inherited Members

- Public Types inherited from opennn::Dataset
enum class  Codification { UTF8 , SHIFT_JIS }
 Text encoding of the source data file. More...
 
enum class  Separator { Space , Tab , Comma , Semicolon }
 Field separator used when reading delimited text files. More...
 
- Protected Member Functions inherited from opennn::Dataset
 Dataset ()=default
 
void set_default_variable_roles ()
 
void set_default_variable_roles_forecasting ()
 
void infer_variable_types_from_data ()
 
void read_data_file_preview (const vector< vector< string_view > > &)
 
void check_separators (string_view) const
 
void samples_from_JSON (const Json *)
 
void variables_to_JSON (JsonWriter &) const
 
void samples_to_JSON (JsonWriter &) const
 
void preview_data_to_JSON (JsonWriter &) const
 
void variables_from_JSON (const Json *)
 
void preview_data_from_JSON (const Json *)
 
- Protected Attributes inherited from opennn::Dataset
Shape input_shape
 
Shape target_shape
 
Shape decoder_shape
 
vector< SampleRole > sample_roles
 
vector< string > sample_ids
 
vector< Variable > variables
 
MatrixR data
 
filesystem::path data_path
 
Separator separator = Separator::Comma
 
bool has_header = false
 
bool has_sample_ids = false
 
Codification codification = Codification::UTF8
 
vector< vector< string > > data_file_preview
 
bool display = true
 
const vector< string > positive_words = {"1", "yes", "positive", "+", "true", "good", "si", "sí", "Sí"}
 
const vector< string > negative_words = {"0", "no", "negative", "-", "false", "bad", "not", "No"}
 

Detailed Description

Token-based language dataset with input/target vocabularies and binary token cache.

Constructor & Destructor Documentation

◆ LanguageDataset() [1/4]

opennn::LanguageDataset::LanguageDataset ( const filesystem::path & = "")

Creates a language dataset, optionally reading documents from the given path.

◆ LanguageDataset() [2/4]

opennn::LanguageDataset::LanguageDataset ( const filesystem::path & ,
Index maximum_vocabulary_size )

Creates a language dataset capped at the given vocabulary size.

◆ LanguageDataset() [3/4]

opennn::LanguageDataset::LanguageDataset ( const filesystem::path & ,
Index maximum_vocabulary_size,
Index minimum_token_frequency )

Creates a language dataset with vocabulary size and minimum token frequency filters.

◆ LanguageDataset() [4/4]

opennn::LanguageDataset::LanguageDataset ( const Index ,
Index ,
Index  )

Creates a synthetic language dataset with the given sample count and vocabulary controls.

Member Function Documentation

◆ calculate_target_distribution()

VectorI opennn::LanguageDataset::calculate_target_distribution ( ) const
override virtual

Returns the distribution of target tokens or classes.

Reimplemented from opennn::Dataset.

◆ create_vocabulary()

void opennn::LanguageDataset::create_vocabulary ( const vector< vector< string_view > > & ,
vector< string > &  ) const

Builds a vocabulary from the tokenized documents, filtered by frequency and size limits.

Parameters
documents: Tokenized input or target documents.
vocabulary: Output vector receiving the resulting vocabulary.

◆ fill_decoder()

void opennn::LanguageDataset::fill_decoder ( const vector< Index > & ,
const vector< Index > & ,
float * ,
bool is_training,
bool parallelize = true,
int = -1 ) const
override virtual

Streams decoder token indices (target shifted right) into the destination buffer.

Reimplemented from opennn::Dataset.

◆ fill_inputs()

void opennn::LanguageDataset::fill_inputs ( const vector< Index > & ,
const vector< Index > & ,
float * ,
bool is_training,
bool parallelize = true,
int = -1 ) const
override virtual

Streams input token indices of the selected samples into the destination buffer.

Reimplemented from opennn::Dataset.

◆ fill_targets()

void opennn::LanguageDataset::fill_targets ( const vector< Index > & ,
const vector< Index > & ,
float * ,
bool is_training,
bool parallelize = true,
int = -1 ) const
override virtual

Streams target token indices of the selected samples into the destination buffer.

Reimplemented from opennn::Dataset.

◆ from_JSON()

void opennn::LanguageDataset::from_JSON ( const JsonDocument & )
override virtual

Loads dataset state from a JSON document.

Implements opennn::Dataset.

◆ get_input_vocabulary()

const vector< string > & opennn::LanguageDataset::get_input_vocabulary ( ) const
inline

◆ get_input_vocabulary_map()

const unordered_map< string, Index > & opennn::LanguageDataset::get_input_vocabulary_map ( ) const
inline

◆ get_input_vocabulary_size()

Index opennn::LanguageDataset::get_input_vocabulary_size ( ) const
inline

◆ get_maximum_input_sequence_length()

Index opennn::LanguageDataset::get_maximum_input_sequence_length ( ) const
inline

◆ get_maximum_target_sequence_length()

Index opennn::LanguageDataset::get_maximum_target_sequence_length ( ) const
inline

◆ get_samples_number() [1/2]

Index opennn::LanguageDataset::get_samples_number ( ) const
override virtual

Returns the number of token sequences in the dataset.

Reimplemented from opennn::Dataset.

◆ get_samples_number() [2/2]

Index opennn::Dataset::get_samples_number ( const string & ) const
nodiscard

Returns the number of samples with the given role ("Training", "Validation", ...).

◆ get_target_inverse_vocabulary_map()

const unordered_map< Index, string > & opennn::LanguageDataset::get_target_inverse_vocabulary_map ( ) const
inline

◆ get_target_vocabulary()

const vector< string > & opennn::LanguageDataset::get_target_vocabulary ( ) const
inline

◆ get_target_vocabulary_map()

const unordered_map< string, Index > & opennn::LanguageDataset::get_target_vocabulary_map ( ) const
inline

◆ get_target_vocabulary_size()

Index opennn::LanguageDataset::get_target_vocabulary_size ( ) const
inline

◆ read_txt()

void opennn::LanguageDataset::read_txt ( )

Reads the configured text corpus into the dataset and builds vocabularies.

◆ set_input_vocabulary()

void opennn::LanguageDataset::set_input_vocabulary ( const vector< string > & )

Replaces the input vocabulary and rebuilds its lookup map.

◆ set_maximum_vocabulary_size()

void opennn::LanguageDataset::set_maximum_vocabulary_size ( Index new_maximum)
inline

◆ set_minimum_token_frequency()

void opennn::LanguageDataset::set_minimum_token_frequency ( Index new_minimum)
inline

◆ set_target_vocabulary()

void opennn::LanguageDataset::set_target_vocabulary ( const vector< string > & )

Replaces the target vocabulary and rebuilds its lookup maps.

◆ to_JSON()

void opennn::LanguageDataset::to_JSON ( JsonWriter & ) const
override virtual

Writes dataset state to a JSON writer.

Reimplemented from opennn::Dataset.

Member Data Documentation

◆ END_INDEX

const float opennn::LanguageDataset::END_INDEX = 3.0f
inline static

◆ END_TOKEN

const string opennn::LanguageDataset::END_TOKEN = "[END]"
inline static

◆ PAD_TOKEN

const string opennn::LanguageDataset::PAD_TOKEN = "[PAD]"
inline static

◆ reserved_tokens

const vector<string> opennn::LanguageDataset::reserved_tokens = {PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN}
inline static

◆ START_INDEX

const float opennn::LanguageDataset::START_INDEX = 2.0f
inline static

◆ START_TOKEN

const string opennn::LanguageDataset::START_TOKEN = "[START]"
inline static

◆ UNK_INDEX

const float opennn::LanguageDataset::UNK_INDEX = 1.0f
inline static

◆ UNK_TOKEN

const string opennn::LanguageDataset::UNK_TOKEN = "[UNK]"
inline static