|
| | LanguageDataset (const filesystem::path &path="") |
| | Constructs a LanguageDataset, optionally loading from CSV.
|
| |
| | LanguageDataset (const Index samples_number, Index maximum_input_sequence_length, Index maximum_target_sequence_length) |
| | Constructs an empty LanguageDataset of given dimensions.
|
| |
| const vector< string > & | get_input_vocabulary () const |
| | Read-only access to the input-side vocabulary.
|
| |
| const vector< string > & | get_target_vocabulary () const |
| | Read-only access to the target-side vocabulary.
|
| |
| Index | get_input_vocabulary_size () const |
| | Number of distinct tokens in the input vocabulary.
|
| |
| Index | get_target_vocabulary_size () const |
| | Number of distinct tokens in the target vocabulary.
|
| |
| Index | get_maximum_input_sequence_length () const |
| | Maximum input sequence length supported.
|
| |
| Index | get_maximum_target_sequence_length () const |
| | Maximum target sequence length supported.
|
| |
| void | set_input_vocabulary (const vector< string > &new_vocabulary) |
| | Replaces the input vocabulary.
|
| |
| void | set_target_vocabulary (const vector< string > &new_vocabulary) |
| | Replaces the target vocabulary.
|
| |
| void | read_csv () override |
| | Reads tokens from the configured CSV file into the dataset.
|
| |
| void | create_vocabulary (const vector< vector< string > > &, vector< string > &) const |
| | Builds a vocabulary from a list of tokenized samples.
|
| |
| void | encode_input (const vector< vector< string > > &) |
| | Encodes input tokens into ids and stores them in the dataset.
|
| |
| void | encode_decoder_target_sequence_to_sequence (const vector< vector< string > > &) |
| | Encodes decoder target tokens for sequence-to-sequence training.
|
| |
| void | encode_target_classification (const vector< vector< string > > &) |
| | Encodes target tokens for classification training.
|
| |
| void | from_JSON (const JsonDocument &) override |
| | Loads dataset metadata and vocabularies from a parsed JSON document.
|
| |
| void | to_JSON (JsonWriter &) const override |
| | Writes dataset metadata and vocabularies to a streaming JSON writer.
|
| |
| | Dataset (const Index samples_number=0, const Shape &input_shape={0}, const Shape &target_shape={0}) |
| | Constructs an empty dataset of given dimensions.
|
| |
| | Dataset (const filesystem::path &data_path, const string &separator, bool has_header=true, bool has_ids=false, const Codification &codification=Codification::UTF8) |
| | Constructs a dataset by loading a delimited text file.
|
| |
| Index | get_samples_number () const |
| | Returns the total number of samples (rows of data).
|
| |
| Index | get_samples_number (const string &role_name) const |
| | Returns the number of samples assigned to a given role.
|
| |
| Index | get_used_samples_number () const |
| | Returns the number of samples whose role is not "None".
|
| |
| vector< Index > | get_sample_indices (const string &role_name) const |
| | Returns the indices of the samples assigned to a given role.
|
| |
| vector< Index > | get_used_sample_indices () const |
| | Returns the indices of all samples whose role is not "None".
|
| |
| const vector< SampleRole > & | get_sample_roles () const |
| | Returns the per-sample role assignments.
|
| |
| vector< Index > | get_sample_roles_vector () const |
| | Returns the per-sample role indices as plain integers.
|
| |
| VectorI | get_sample_role_numbers () const |
| | Counts the samples assigned to each role.
|
| |
| Index | get_variables_number () const |
| | Returns the total number of variables (columns of data).
|
| |
| Index | get_variables_number (const string &role_name) const |
| | Returns the number of variables assigned to a given role.
|
| |
| Index | get_used_variables_number () const |
| | Returns the number of variables that are not "Unused".
|
| |
| const vector< Variable > & | get_variables () const |
| | Returns the per-variable metadata.
|
| |
| vector< Variable > | get_variables (const string &role_name) const |
| | Returns the variables assigned to a given role.
|
| |
| Index | get_variable_index (const string &name) const |
| | Returns the column index of the variable with a given name.
|
| |
| Index | get_variable_index (const Index id) const |
| | Returns the column index of the variable with a given numeric id.
|
| |
| vector< Index > | get_variable_indices (const string &role_name) const |
| | Returns the column indices of the variables assigned to a given role.
|
| |
| vector< Index > | get_used_variables_indices () const |
| | Returns the column indices of all variables that are not "Unused".
|
| |
| vector< string > | get_variable_names () const |
| | Returns the names of every variable.
|
| |
| vector< string > | get_variable_names (const string &role_name) const |
| | Returns the names of the variables assigned to a given role.
|
| |
| VariableType | get_variable_type (const Index index) const |
| | Returns the type of a variable (Numeric, Binary, Categorical, ...).
|
| |
| vector< VariableType > | get_variable_types (const vector< Index > indices) const |
| | Returns the types of a list of variables.
|
| |
| Index | get_features_number () const |
| | Returns the total number of features.
|
| |
| Index | get_features_number (const string &role_name) const |
| | Returns the number of features assigned to a given role.
|
| |
| Index | get_used_features_number () const |
| | Returns the number of features belonging to variables that are not "Unused".
|
| |
| vector< string > | get_feature_names () const |
| | Returns the names of every feature.
|
| |
| vector< string > | get_feature_names (const string &role_name) const |
| | Returns the names of the features assigned to a given role.
|
| |
| vector< vector< Index > > | get_feature_indices () const |
| | Returns the per-variable feature indices.
|
| |
| vector< Index > | get_feature_indices (const Index variable_index) const |
| | Returns the feature indices for a single variable.
|
| |
| vector< Index > | get_feature_indices (const string &role_name) const |
| | Returns the feature indices for variables of a given role.
|
| |
| vector< Index > | get_used_feature_indices () const |
| | Returns the feature indices for all variables that are not "Unused".
|
| |
| vector< Index > | get_feature_dimensions () const |
| | Returns the per-variable feature dimension (1 for Numeric, N for Categorical).
|
| |
| Shape | get_shape (const string &role_name) const |
| | Returns the input or target shape used by the network.
|
| |
| vector< string > | get_feature_scalers (const string &role_name) const |
| | Returns the scaler chosen for each variable of a given role.
|
| |
| virtual void | get_batches (const vector< Index > &sample_indices, Index batch_size, bool shuffle, vector< vector< Index > > &batches) const |
| | Splits a list of sample indices into batches.
|
| |
| const MatrixR & | get_data () const |
| | Returns the raw data matrix.
|
| |
| MatrixR | get_feature_data (const string &role_name) const |
| | Returns the data matrix restricted to the features of a given role.
|
| |
| MatrixR | get_data (const string &sample_role, const string &variable_role) const |
| | Returns the data restricted to a sample-role and variable-role intersection.
|
| |
| MatrixR | get_data_from_indices (const vector< Index > &sample_indices, const vector< Index > &variable_indices) const |
| | Returns the data restricted to specific samples and variables.
|
| |
| VectorR | get_sample_data (const Index sample_index) const |
| | Returns a single sample as a row vector.
|
| |
| MatrixR | get_variable_data (const Index variable_index) const |
| | Returns the data for a single variable across all samples.
|
| |
| MatrixR | get_variable_data (const Index variable_index, const vector< Index > &sample_indices) const |
| | Returns the data for a single variable on a subset of samples.
|
| |
| MatrixR | get_variable_data (const string &variable_name) const |
| | Returns the data for a single variable identified by name.
|
| |
| const vector< vector< string > > & | get_data_file_preview () const |
| | Returns the cached preview of the source file (first rows).
|
| |
| MissingValuesMethod | get_missing_values_method () const |
| | Returns the configured missing-value strategy.
|
| |
| string | get_missing_values_method_string () const |
| | Returns the missing-value strategy as a string.
|
| |
| const filesystem::path & | get_data_path () const |
| | Returns the path to the source data file.
|
| |
| const Separator & | get_separator () const |
| | Returns the configured field separator.
|
| |
| string | get_separator_string () const |
| | Returns the field separator as the actual delimiter character(s).
|
| |
| string | get_separator_name () const |
| | Returns the field separator as a human-readable name.
|
| |
| const Codification & | get_codification () const |
| | Returns the configured source-file codification.
|
| |
| const string | get_codification_string () const |
| | Returns the codification as a string.
|
| |
| const string & | get_missing_values_label () const |
| | Returns the label that marks missing values in the source file.
|
| |
| bool | get_display () const |
| | Reports whether progress messages are printed.
|
| |
| bool | is_empty () const |
| | Reports whether the data matrix is empty.
|
| |
| Shape | get_input_shape () const |
| | Returns the input shape.
|
| |
| Shape | get_target_shape () const |
| | Returns the target shape.
|
| |
| void | set (const Index samples_number=0, const Shape &input_shape={}, const Shape &target_shape={}) |
| | Resets the dataset to the given number of samples and input/target shapes.
|
| |
| void | set (const filesystem::path &data_path, const string &separator, bool has_header=true, bool has_ids=false, const Dataset::Codification &codification=Codification::UTF8) |
| | Resets the dataset by loading a delimited text file.
|
| |
| void | set (const filesystem::path &file_name) |
| | Resets the dataset by loading a previously serialized JSON state.
|
| |
| void | set_default () |
| | Resets configuration members to defaults.
|
| |
| void | set_sample_roles (const string &role_name) |
| | Assigns the same role to every sample.
|
| |
| void | set_sample_role (const Index sample_index, const string &role_name) |
| | Assigns a role to a single sample.
|
| |
| void | set_sample_roles (const vector< string > &role_names) |
| | Assigns roles to all samples from a parallel string vector.
|
| |
| void | set_sample_roles (const vector< Index > &sample_indices, const string &role_name) |
| | Assigns the same role to a list of samples.
|
| |
| void | set_variables (const vector< Variable > &new_variables) |
| | Replaces the per-variable metadata.
|
| |
| void | set_default_variable_names () |
| | Sets default names ("variable_1", "variable_2", ...) for every variable.
|
| |
| virtual void | set_variable_roles (const vector< string > &role_names) |
| | Assigns roles to all variables from a parallel string vector.
|
| |
| void | set_variables (const string &description) |
| | Re-creates the variables vector from an input/target shape descriptor.
|
| |
| void | set_variable_indices (const vector< Index > &input_indices, const vector< Index > &target_indices) |
| | Marks the variables at input_indices as Input and those at target_indices as Target.
|
| |
| void | set_input_variables_unused () |
| | Marks all input variables as Unused.
|
| |
| void | set_variable_role (const Index variable_index, const string &role_name) |
| | Sets the role of a single variable by index.
|
| |
| void | set_variable_role (const string &variable_name, const string &role_name) |
| | Sets the role of a single variable by name.
|
| |
| void | set_variable_type (const Index variable_index, const VariableType &type) |
| | Sets the type of a single variable by index.
|
| |
| void | set_variable_type (const string &variable_name, const VariableType &type) |
| | Sets the type of a single variable by name.
|
| |
| void | set_variable_types (const VariableType &type) |
| | Sets every variable to a given type.
|
| |
| void | set_variable_names (const vector< string > &new_variable_names) |
| | Replaces the names of every variable.
|
| |
| void | set_variables_number (const Index new_size) |
| | Resizes the variables vector.
|
| |
| void | set_variable_scalers (const string &scaler_name) |
| | Sets the same scaler on every variable.
|
| |
| void | set_variable_scalers (const vector< string > &scaler_names) |
| | Sets one scaler per variable.
|
| |
| void | set_binary_variables () |
| | Detects binary variables (two distinct values) and tags them accordingly.
|
| |
| void | set_feature_names (const vector< string > &new_feature_names) |
| | Replaces the names of every feature.
|
| |
| void | set_variable_roles (const string &role_name) |
| | Assigns the same role to every variable.
|
| |
| void | set_shape (const string &role_name, const Shape &new_shape) |
| | Sets the input or target shape.
|
| |
| void | set_data (const MatrixR &new_data) |
| | Replaces the data matrix.
|
| |
| void | set_data_path (const filesystem::path &new_data_path) |
| | Sets the path to the source data file.
|
| |
| void | set_has_header (bool new_has_header) |
| | Sets whether the source file has a header row.
|
| |
| void | set_has_ids (bool new_has_ids) |
| | Sets whether the source file has a sample-id column.
|
| |
| void | set_separator (const Separator &new_separator) |
| | Sets the field separator.
|
| |
| void | set_separator_string (const string &new_separator_string) |
| | Sets the field separator from its delimiter character(s).
|
| |
| void | set_separator_name (const string &new_separator_name) |
| | Sets the field separator from its human-readable name.
|
| |
| void | set_codification (const Codification &new_codification) |
| | Sets the source-file codification.
|
| |
| void | set_codification (const string &new_codification) |
| | Sets the source-file codification from its name.
|
| |
| void | set_missing_values_label (string label) |
| | Sets the label used for missing values in the source file.
|
| |
| void | set_missing_values_method (const MissingValuesMethod &method) |
| | Sets the missing-value handling strategy.
|
| |
| void | set_missing_values_method (const string &method_name) |
| | Sets the missing-value handling strategy from its name.
|
| |
| void | set_gmt (const Index new_gmt) |
| | Sets the GMT offset for time variables.
|
| |
| void | set_display (bool new_display) |
| | Toggles progress messages.
|
| |
| bool | is_sample_used (const Index i) const |
| | Reports whether a sample is used (any role other than None).
|
| |
| bool | has_binary_variables () const |
| | Reports whether at least one variable is binary.
|
| |
| bool | has_categorical_variables () const |
| | Reports whether at least one variable is categorical.
|
| |
| bool | has_binary_or_categorical_variables () const |
| | Reports whether the dataset has any binary or categorical variable.
|
| |
| bool | has_time_variable () const |
| | Reports whether at least one variable plays the Time role.
|
| |
| bool | has_validation () const |
| | Reports whether at least one sample is assigned to Validation.
|
| |
| bool | has_missing_values (const vector< string > &labels) const |
| | Reports whether the dataset has missing values matching any of the supplied labels.
|
| |
| void | split_samples (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true) |
| | Splits samples into training/validation/testing partitions.
|
| |
| void | split_samples_sequential (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f) |
| | Splits samples into partitions in their original order.
|
| |
| void | split_samples_random (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f) |
| | Splits samples into partitions after random shuffling.
|
| |
| vector< string > | unuse_uncorrelated_variables (const float minimum_correlation=0.25f) |
| | Marks variables whose correlation with the target is below minimum_correlation as Unused.
|
| |
| vector< string > | unuse_collinear_variables (const float maximum_correlation=0.95f) |
| | Marks variables whose correlation with another input exceeds maximum_correlation as Unused.
|
| |
| void | set_data_constant (const float value) |
| | Fills the data matrix with a constant value.
|
| |
| vector< Descriptives > | calculate_feature_descriptives () const |
| | Computes descriptive statistics for every feature.
|
| |
| vector< Descriptives > | calculate_variable_descriptives_positive_samples () const |
| | Computes descriptives for inputs restricted to positive-target samples.
|
| |
| vector< Descriptives > | calculate_variable_descriptives_negative_samples () const |
| | Computes descriptives for inputs restricted to negative-target samples.
|
| |
| vector< Descriptives > | calculate_variable_descriptives_categories (const Index variable_index) const |
| | Computes descriptives per category of a categorical variable.
|
| |
| vector< Descriptives > | calculate_feature_descriptives (const string &role_name) const |
| | Computes feature descriptives for a single role.
|
| |
| vector< Histogram > | calculate_variable_distributions (const Index bins_number=10) const |
| | Builds histograms of every variable.
|
| |
| vector< BoxPlot > | calculate_variables_box_plots () const |
| | Computes box-plot statistics for every variable.
|
| |
| Tensor< Correlation, 2 > | calculate_input_variable_correlations (Correlation(*correlation_function)(const MatrixR &, const MatrixR &), Correlation::Method method, const string &samples_role) const |
| | Computes a custom correlation between every pair of input variables.
|
| |
| Tensor< Correlation, 2 > | calculate_input_variable_pearson_correlations () const |
| | Computes Pearson correlations between every pair of input variables.
|
| |
| Tensor< Correlation, 2 > | calculate_input_variable_spearman_correlations () const |
| | Computes Spearman rank correlations between every pair of input variables.
|
| |
| Tensor< Correlation, 2 > | calculate_input_target_variable_correlations (Correlation(*correlation_function)(const MatrixR &, const MatrixR &), const string &samples_role) const |
| | Computes a custom correlation between inputs and targets.
|
| |
| Tensor< Correlation, 2 > | calculate_input_target_variable_pearson_correlations () const |
| | Computes Pearson correlations between inputs and targets.
|
| |
| Tensor< Correlation, 2 > | calculate_input_target_variable_spearman_correlations () const |
| | Computes Spearman rank correlations between inputs and targets.
|
| |
| VectorI | calculate_correlations_rank () const |
| | Returns the rank of every input variable by absolute Pearson correlation with the targets.
|
| |
| void | set_default_variable_scalers () |
| | Picks default scalers for every variable based on its type.
|
| |
| vector< Descriptives > | scale_data () |
| | Scales the entire data matrix.
|
| |
| virtual vector< Descriptives > | scale_features (const string &role_name) |
| | Scales the features of a given role.
|
| |
| void | unscale_features (const string &role_name, const vector< Descriptives > &feature_descriptives) |
| | Inverse-scales the features of a given role.
|
| |
| VectorI | calculate_target_distribution () const |
| | Counts the samples of every target class.
|
| |
| vector< vector< Index > > | calculate_Tukey_outliers (const float tukey_factor=1.5f, bool replace=false) |
| | Detects Tukey outliers per variable.
|
| |
| vector< vector< Index > > | replace_Tukey_outliers_with_NaN (const float tukey_factor=1.5f) |
| | Detects Tukey outliers and replaces them with NaN.
|
| |
| void | unuse_Tukey_outliers (const float tukey_factor=1.5f) |
| | Marks Tukey-outlier samples as unused (sample role "None").
|
| |
| virtual void | set_data_random () |
| | Fills the data matrix with uniform random values.
|
| |
| virtual void | set_data_integer (const Index vocabulary_size) |
| | Fills the data matrix with random integers in [0, vocabulary_size).
|
| |
| void | set_data_rosenbrock () |
| | Fills the data matrix with samples from the Rosenbrock function.
|
| |
| void | set_data_binary_classification () |
| | Fills the data matrix with synthetic binary-classification data.
|
| |
| void | save (const filesystem::path &file_name) const |
| | Saves the dataset state to a JSON file on disk.
|
| |
| void | load (const filesystem::path &file_name) |
| | Loads the dataset state from a JSON file on disk.
|
| |
| void | save_data () const |
| | Saves the data matrix back to the configured source file.
|
| |
| void | save_data_binary (const filesystem::path &file_name) const |
| | Saves the data matrix as a binary file.
|
| |
| void | load_data_binary () |
| | Loads the data matrix from a binary file produced by save_data_binary().
|
| |
| Index | get_missing_values_number () const |
| | Returns the total number of cells flagged as missing.
|
| |
| bool | has_nan () const |
| | Reports whether the data matrix contains any NaN.
|
| |
| bool | has_nan_row (const Index row_index) const |
| | Reports whether a row contains any NaN.
|
| |
| virtual void | impute_missing_values_unuse () |
| | Marks samples with missing values as unused (sample role "None").
|
| |
| void | impute_missing_values_statistic (const MissingValuesMethod &method) |
| | Replaces missing values with a per-variable statistic.
|
| |
| virtual void | impute_missing_values_interpolate () |
| | Replaces missing values via linear interpolation along each variable.
|
| |
| void | scrub_missing_values () |
| | Removes samples that contain missing values.
|
| |
| void | calculate_missing_values_statistics () |
| | Updates the cached missing-value statistics (counts, indices).
|
| |
| VectorI | count_nans_per_variable () const |
| | Counts NaN cells per variable.
|
| |
| Index | count_variables_with_nan () const |
| | Counts variables that contain at least one NaN.
|
| |
| Index | count_rows_with_nan () const |
| | Counts samples that contain at least one NaN.
|
| |
| Index | count_nan () const |
| | Counts the total number of NaN cells.
|
| |
| vector< vector< Index > > | split_samples (const vector< Index > &indices, Index parts_number) const |
| | Splits a list of sample indices into chunks of (roughly) equal size.
|
| |
| DateFormat | infer_dataset_date_format (const vector< Variable > &variables, const vector< vector< string > > &data_file_preview, bool has_header, const string &missing_values_label) |
| | Infers the date format used by date-typed variables in the source file.
|
| |
| virtual void | fill_inputs (const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const |
| | Fills a contiguous float buffer with input data for a batch of samples.
|
| |
| virtual void | augment_inputs (float *buffer, Index batch_size) const |
| | Optionally augments inputs in-place after fill_inputs() (e.g. random crops).
|
| |
| virtual void | fill_decoder (const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const |
| | Fills a contiguous buffer with decoder-side inputs (transformer-style models).
|
| |
| virtual void | fill_targets (const vector< Index > &sample_indices, const vector< Index > &feature_indices, float *buffer, bool transpose=true, int contiguous=-1) const |
| | Fills a contiguous buffer with target data for a batch of samples.
|
| |