Token-based language dataset with input/target vocabularies and binary token cache.
More...
|
| | LanguageDataset (const filesystem::path &="") |
| | Creates a language dataset, optionally reading documents from the given path.
|
| |
| | LanguageDataset (const filesystem::path &, Index maximum_vocabulary_size) |
| | Creates a language dataset capped at the given vocabulary size.
|
| |
| | LanguageDataset (const filesystem::path &, Index maximum_vocabulary_size, Index minimum_token_frequency) |
| | Creates a language dataset with vocabulary size and minimum token frequency filters.
|
| |
| | LanguageDataset (const Index, Index, Index) |
| | Creates a synthetic language dataset with the given sample count and vocabulary controls.
|
| |
| const vector< string > & | get_input_vocabulary () const |
| |
| const vector< string > & | get_target_vocabulary () const |
| |
| Index | get_input_vocabulary_size () const |
| |
| Index | get_target_vocabulary_size () const |
| |
| const unordered_map< string, Index > & | get_input_vocabulary_map () const |
| |
| const unordered_map< string, Index > & | get_target_vocabulary_map () const |
| |
| const unordered_map< Index, string > & | get_target_inverse_vocabulary_map () const |
| |
| Index | get_maximum_input_sequence_length () const |
| |
| Index | get_maximum_target_sequence_length () const |
| |
| void | set_input_vocabulary (const vector< string > &) |
| | Replaces the input vocabulary and rebuilds its lookup map.
|
| |
| void | set_target_vocabulary (const vector< string > &) |
| | Replaces the target vocabulary and rebuilds its lookup maps.
|
| |
| void | set_maximum_vocabulary_size (Index new_maximum) |
| |
| void | set_minimum_token_frequency (Index new_minimum) |
| |
| Index | get_samples_number () const override |
| | Returns the number of token sequences in the dataset.
|
| |
| VectorI | calculate_target_distribution () const override |
| | Returns the distribution of target tokens or classes.
|
| |
| void | read_txt () |
| | Reads the configured text corpus into the dataset and builds vocabularies.
|
| |
| void | create_vocabulary (const vector< vector< string_view > > &, vector< string > &) const |
| | Builds a vocabulary from the tokenized documents, filtered by frequency and size limits.
|
| |
| void | from_JSON (const JsonDocument &) override |
| | Loads dataset state from a JSON document.
|
| |
| void | to_JSON (JsonWriter &) const override |
| | Writes dataset state to a JSON writer.
|
| |
| void | fill_inputs (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override |
| | Streams input token indices of the selected samples into the destination buffer.
|
| |
| void | fill_targets (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override |
| | Streams target token indices of the selected samples into the destination buffer.
|
| |
| void | fill_decoder (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override |
| | Streams decoder token indices (target shifted right) into the destination buffer.
|
| |
| Index | get_samples_number (const string &) const |
| | Returns the number of samples with the given role ("Training", "Validation", ...).
|
| |
| virtual | ~Dataset ()=default |
| |
| Index | get_samples_number (const string &) const |
| | Returns the number of samples with the given role ("Training", "Validation", ...).
|
| |
| Index | get_used_samples_number () const |
| | Returns the number of samples whose role is not None.
|
| |
| vector< Index > | get_sample_indices (const string &) const |
| | Returns indices of samples with the given role name.
|
| |
| vector< Index > | get_used_sample_indices () const |
| | Returns indices of all samples whose role is not None.
|
| |
| const vector< SampleRole > & | get_sample_roles () const |
| | Returns the per-sample role vector.
|
| |
| vector< Index > | get_sample_roles_vector () const |
| | Returns the per-sample roles as integer indices.
|
| |
| VectorI | get_sample_role_numbers () const |
| | Returns the per-sample roles as a tensor of integer indices.
|
| |
| Index | get_variables_number () const |
| | Returns the total number of variables (column descriptors).
|
| |
| Index | get_variables_number (const string &) const |
| | Returns the number of variables with the given role name.
|
| |
| Index | get_used_variables_number () const |
| | Returns the number of variables whose role is in use.
|
| |
| const vector< Variable > & | get_variables () const |
| | Returns the variable descriptors.
|
| |
| vector< Variable > | get_variables (const string &) const |
| | Returns the variables with the given role name.
|
| |
| Index | get_variable_index (const string &) const |
| | Returns the index of the variable with the given name.
|
| |
| Index | get_variable_index (const Index) const |
| | Returns the variable index corresponding to a flat feature index.
|
| |
| vector< Index > | get_variable_indices (const string &) const |
| | Returns the indices of variables matching the given role name.
|
| |
| vector< Index > | get_used_variables_indices () const |
| | Returns the indices of all in-use variables.
|
| |
| vector< string > | get_variable_names () const |
| | Returns the names of all variables.
|
| |
| vector< string > | get_variable_names (const string &) const |
| | Returns the names of variables with the given role name.
|
| |
| VariableType | get_variable_type (const Index index) const |
| | Returns the VariableType of the variable at the given index.
|
| |
| vector< VariableType > | get_variable_types (const vector< Index > &indices) const |
| | Returns the VariableType for each of the given variable indices.
|
| |
| Index | get_features_number () const |
| | Returns the total number of features (data matrix columns).
|
| |
| Index | get_features_number (const string &) const |
| | Returns the number of features for variables with the given role name.
|
| |
| Index | get_used_features_number () const |
| | Returns the number of features for in-use variables.
|
| |
| vector< string > | get_feature_names () const |
| | Returns the expanded feature names (one entry per data matrix column).
|
| |
| vector< string > | get_feature_names (const string &) const |
| | Returns the expanded feature names restricted to the given role.
|
| |
| vector< vector< Index > > | get_feature_indices () const |
| | Returns the data column indices grouped per variable.
|
| |
| vector< Index > | get_feature_indices (const Index) const |
| | Returns the data column indices that belong to the given variable index.
|
| |
| vector< Index > | get_feature_indices (const string &) const |
| | Returns the data column indices that belong to variables with the given role name.
|
| |
| vector< Index > | get_used_feature_indices () const |
| | Returns the data column indices that belong to in-use variables.
|
| |
| vector< Index > | get_feature_dimensions () const |
| | Returns the per-variable feature dimension counts.
|
| |
| Shape | get_shape (const string &) const |
| | Returns the configured Shape for the given role ("Input", "Target", "Decoder").
|
| |
| virtual void | get_batches (const vector< Index > &, Index, bool, vector< vector< Index > > &) const |
| | Splits the sample indices into batches and writes them into the output vector of batches.
|
| |
| const vector< vector< string > > & | get_data_file_preview () const |
| | Returns the parsed preview rows captured during the last file read.
|
| |
| const filesystem::path & | get_data_path () const |
| | Returns the configured data file path.
|
| |
| const Separator & | get_separator () const |
| | Returns the current field Separator.
|
| |
| string | get_separator_string () const |
| | Returns the separator as the literal character used in files.
|
| |
| string | get_separator_name () const |
| | Returns the separator as its enumerator name ("Space", "Tab", ...).
|
| |
| const Codification & | get_codification () const |
| | Returns the configured text Codification.
|
| |
| string | get_codification_string () const |
| | Returns the codification as its enumerator name.
|
| |
| bool | get_display () const |
| | Returns whether progress messages are printed.
|
| |
| virtual bool | is_empty () const |
| | Returns true when the dataset contains no samples.
|
| |
| Shape | get_input_shape () const |
| | Returns the configured input tensor shape.
|
| |
| Shape | get_target_shape () const |
| | Returns the configured target tensor shape.
|
| |
| const MatrixR & | get_data () const |
| | Returns the raw data matrix (rows = samples, columns = features).
|
| |
| void | set_data (const MatrixR &) |
| | Replaces the underlying data matrix.
|
| |
| void | set_data_constant (const float) |
| | Fills the data matrix with the given constant value.
|
| |
| void | set_default () |
| | Restores default dataset settings.
|
| |
| void | set_sample_roles (const string &) |
| | Assigns the same role to every sample.
|
| |
| void | set_sample_role (const Index, const string &) |
| | Sets the role of a single sample by index.
|
| |
| void | set_sample_roles (const vector< string > &) |
| | Assigns a role per sample from a string vector.
|
| |
| void | set_sample_roles (const vector< Index > &, const string &) |
| | Assigns the same role to the given sample indices.
|
| |
| void | set_variables (const vector< Variable > &new_variables) |
| |
| void | set_default_variable_names () |
| | Assigns default placeholder names to all variables.
|
| |
| void | set_variable_roles (const vector< string > &) |
| | Sets the role of every variable from the given string list.
|
| |
| void | set_variable_indices (const vector< Index > &, const vector< Index > &) |
| | Sets which variable indices are inputs and which are targets.
|
| |
| void | set_input_variables_unused () |
| | Marks all input variables as unused (role None).
|
| |
| void | set_variable_role (const Index, const string &) |
| | Sets the role of the variable at the given index.
|
| |
| void | set_variable_role (const string &, const string &) |
| | Sets the role of the variable with the given name.
|
| |
| void | set_variable_type (const Index, const VariableType &) |
| | Sets the type of the variable at the given index.
|
| |
| void | set_variable_type (const string &, const VariableType &) |
| | Sets the type of the variable with the given name.
|
| |
| void | set_variable_types (const VariableType &) |
| | Sets every variable to the given type.
|
| |
| void | set_binary_variables () |
| | Detects and marks variables with binary values as VariableType::Binary.
|
| |
| void | set_variable_names (const vector< string > &) |
| | Assigns names to all variables from the given list.
|
| |
| void | set_variables_number (const Index new_size) |
| |
| void | set_feature_names (const vector< string > &) |
| | Assigns expanded feature names, propagating categories back to variables.
|
| |
| void | set_variable_roles (const string &) |
| | Assigns the same role to every variable.
|
| |
| void | set_shape (const string &, const Shape &) |
| | Sets the tensor Shape associated with the given role ("Input"/"Target"/"Decoder").
|
| |
| virtual void | resize_input_shape (Index input_features_count) |
| | Resizes the input shape to the given flat feature count.
|
| |
| void | set_data_path (const filesystem::path &new_data_path) |
| |
| void | set_has_header (bool new_has_header) |
| |
| void | set_has_ids (bool new_has_ids) |
| |
| void | set_separator (const Separator &new_separator) |
| |
| void | set_separator_string (const string &) |
| | Sets the separator from its literal character.
|
| |
| void | set_separator_name (const string &) |
| | Sets the separator from its enumerator name.
|
| |
| void | set_codification (const Codification &new_codification) |
| |
| void | set_codification (const string &) |
| | Sets the codification from its enumerator name.
|
| |
| void | set_display (bool new_display) |
| |
| bool | is_sample_used (const Index i) const |
| | Returns true if the sample at index i has a role other than None.
|
| |
| bool | has_binary_variables () const |
| | Returns true if any variable has type Binary.
|
| |
| bool | has_categorical_variables () const |
| | Returns true if any variable has type Categorical.
|
| |
| bool | has_binary_or_categorical_variables () const |
| | Returns true if any variable is Binary or Categorical.
|
| |
| bool | has_time_variable () const |
| | Returns true if any variable has role Time.
|
| |
| bool | has_validation () const |
| | Returns true if any sample is assigned the Validation role.
|
| |
| void | split_samples (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true) |
| | Splits samples into Training/Validation/Testing roles, optionally shuffled.
|
| |
| void | split_samples_sequential (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f) |
| | Splits samples sequentially without shuffling.
|
| |
| void | split_samples_random (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f) |
| | Splits samples randomly across roles.
|
| |
| vector< vector< Index > > | split_samples (const vector< Index > &, Index) const |
| | Splits the given indices into chunks of the requested size.
|
| |
| virtual vector< Descriptives > | scale_features (const string &) |
| | Scales the features with the configured method; returns the applied descriptives.
|
| |
| virtual void | set_data_random () |
| | Fills the data matrix with random values (no-op in base class).
|
| |
| virtual void | set_data_integer (const Index) |
| | Fills the data matrix with random integers up to the given vocabulary size.
|
| |
| MatrixR | get_data (const string &, const string &) const |
| | Returns the data submatrix for the given sample role and feature role.
|
| |
| MatrixR | get_data_from_indices (const vector< Index > &, const vector< Index > &) const |
| | Returns a submatrix from the data using explicit row and column indices.
|
| |
| VectorR | get_sample_data (const Index) const |
| | Returns the row of the data matrix at the given sample index.
|
| |
| MatrixR | get_variable_data (const Index) const |
| | Returns the data columns belonging to the variable at the given index.
|
| |
| MatrixR | get_variable_data (const Index, const vector< Index > &) const |
| | Returns the variable data restricted to the given sample indices.
|
| |
| MatrixR | get_variable_data (const string &) const |
| | Returns the data columns of the variable with the given name.
|
| |
| MatrixR | get_feature_data (const string &) const |
| | Returns the data columns belonging to features with the given role name.
|
| |
| void | set (const Index=0, const Shape &={}, const Shape &={}) |
| | Resets the dataset with the given sample count and input/target shapes.
|
| |
| bool | has_nan () const |
| | Returns true if any entry in the data matrix is NaN.
|
| |
| bool | has_nan_row (const Index) const |
| | Returns true if the sample at the given row contains a NaN.
|
| |
| VectorI | count_nans_per_variable () const |
| | Returns the NaN count for each variable.
|
| |
| Index | count_variables_with_nan () const |
| | Returns the number of variables that contain at least one NaN.
|
| |
| Index | count_rows_with_nan () const |
| | Returns the number of rows that contain at least one NaN.
|
| |
| Index | count_nan () const |
| | Returns the total NaN count in the data matrix.
|
| |
| virtual void | scrub_missing_values () |
| | Removes or imputes missing values using the configured strategy.
|
| |
| virtual vector< Descriptives > | calculate_feature_descriptives (const string &) const |
| | Returns descriptive statistics for features with the given role (subclass-specific).
|
| |
| virtual Tensor< Correlation, 2 > | calculate_input_target_variable_pearson_correlations () const |
| | Returns the Pearson correlations between input and target variables.
|
| |
| virtual VectorI | calculate_correlations_rank () const |
| | Returns input variables ranked by absolute correlation with the target.
|
| |
| virtual void | unscale_features (const string &, const vector< Descriptives > &) |
| | Reverts a previously applied scaling using the supplied descriptives.
|
| |
| void | save (const filesystem::path &) const |
| | Saves the dataset metadata to a JSON file at the given path.
|
| |
| void | load (const filesystem::path &) |
| | Loads the dataset metadata from the JSON file at the given path.
|
| |
| void | save_data () const |
| | Saves the data matrix to the configured data path.
|
| |
| void | save_data_binary (const filesystem::path &) const |
| | Saves the data matrix to the given path in binary form.
|
| |
| void | load_data_binary () |
| | Loads the data matrix from the binary file at the configured data path.
|
| |
| virtual void | augment_inputs (float *, Index) const |
| | Applies data augmentation in place to the input buffer (no-op in base class).
|
| |