Tabular dataset with CSV loading, scaling, descriptive statistics and correlation analysis.
More...
|
| | TabularDataset (const Index=0, const Shape &={0}, const Shape &={0}) |
| | Creates a tabular dataset with the given sample count and input/target shapes.
|
| |
| | TabularDataset (const filesystem::path &, const string &, bool=true, bool=false, const Codification &=Codification::UTF8) |
| | Creates a tabular dataset by reading the given file with the given separator.
|
| |
| void | set (const filesystem::path &, const string &, bool=true, bool=false, const Codification &=Codification::UTF8) |
| | Resets the dataset from the given file using the given parsing options.
|
| |
| void | set (const filesystem::path &) |
| | Resets the dataset by reading from the given path with default parsing options.
|
| |
| vector< string > | get_feature_scalers (const string &) const |
| | Returns the scaler names for variables matching the given role.
|
| |
| void | set_variable_scalers (const string &) |
| | Sets every variable's scaler from the given name.
|
| |
| void | set_variable_scalers (const vector< string > &) |
| | Sets each variable's scaler from the corresponding name in the list.
|
| |
| void | set_default_variable_scalers () |
| | Assigns default scalers based on each variable's type.
|
| |
| void | set_gmt (const Index new_gmt) |
| |
| DateFormat | infer_dataset_date_format (const vector< Variable > &, const vector< vector< string_view > > &, bool, const string &) |
| | Infers the date/time format used in the given file preview rows.
|
| |
| MissingValuesMethod | get_missing_values_method () const |
| |
| string | get_missing_values_method_string () const |
| | Returns the missing-values method as its enumerator name.
|
| |
| const string & | get_missing_values_label () const |
| |
| Index | get_missing_values_number () const |
| |
| void | set_missing_values_label (string label) |
| |
| void | set_missing_values_method (const MissingValuesMethod &method) |
| |
| void | set_missing_values_method (const string &) |
| | Sets the missing-values method from its enumerator name.
|
| |
| bool | has_missing_values (const vector< string_view > &) const |
| | Returns true if any field in the row matches the missing-values label.
|
| |
| void | scrub_missing_values () override |
| | Removes or imputes missing values using the configured MissingValuesMethod.
|
| |
| void | calculate_missing_values_statistics () |
| | Refreshes per-variable and per-row missing-value counts.
|
| |
| void | impute_missing_values_statistic (const MissingValuesMethod &) |
| | Imputes missing values using the given statistic-based method (Mean/Median).
|
| |
| virtual void | impute_missing_values_unuse () |
| | Marks samples containing missing values as unused.
|
| |
| virtual void | impute_missing_values_interpolate () |
| | Fills missing values by interpolating between neighboring samples.
|
| |
| vector< string > | unuse_uncorrelated_variables (const float=0.25f) |
| | Marks input variables whose absolute correlation with targets is below the threshold as unused.
|
| |
| vector< string > | unuse_collinear_variables (const float=0.95f) |
| | Marks input variables that are highly collinear (above the threshold) as unused.
|
| |
| vector< Descriptives > | calculate_feature_descriptives () const |
| | Returns descriptive statistics for every feature across all samples.
|
| |
| vector< Descriptives > | calculate_feature_descriptives (const string &) const override |
| | Returns descriptive statistics for features with the given role name.
|
| |
| vector< Descriptives > | calculate_variable_descriptives_positive_samples () const |
| | Returns variable descriptives computed only over samples with a positive target.
|
| |
| vector< Descriptives > | calculate_variable_descriptives_negative_samples () const |
| | Returns variable descriptives computed only over samples with a negative target.
|
| |
| vector< Descriptives > | calculate_variable_descriptives_categories (const Index) const |
| | Returns variable descriptives restricted to the given target class index.
|
| |
| vector< Histogram > | calculate_variable_distributions (const Index=10) const |
| | Returns per-variable histograms with the given bin count.
|
| |
| vector< BoxPlot > | calculate_variables_box_plots () const |
| | Returns per-variable Tukey box-plot summaries.
|
| |
| Tensor< Correlation, 2 > | calculate_input_variable_correlations (Correlation(*)(const MatrixR &, const MatrixR &), Correlation::Method, const string &) const |
| | Returns the input-input correlation matrix using the given correlation function and method.
|
| |
| Tensor< Correlation, 2 > | calculate_input_variable_pearson_correlations () const |
| | Returns the Pearson correlation matrix among input variables.
|
| |
| Tensor< Correlation, 2 > | calculate_input_variable_spearman_correlations () const |
| | Returns the Spearman correlation matrix among input variables.
|
| |
| Tensor< Correlation, 2 > | calculate_input_target_variable_correlations (Correlation(*)(const MatrixR &, const MatrixR &), const string &) const |
| | Returns the input-target correlation matrix using the given correlation function.
|
| |
| Tensor< Correlation, 2 > | calculate_input_target_variable_pearson_correlations () const override |
| | Returns the Pearson correlation matrix between input and target variables.
|
| |
| Tensor< Correlation, 2 > | calculate_input_target_variable_spearman_correlations () const |
| | Returns the Spearman correlation matrix between input and target variables.
|
| |
| VectorI | calculate_correlations_rank () const override |
| | Returns input variable indices ranked by absolute correlation with the target.
|
| |
| vector< Descriptives > | scale_data () |
| | Scales every feature column using its configured scaler; returns the applied descriptives.
|
| |
| vector< Descriptives > | scale_features (const string &) override |
| | Scales the features with the given role using their configured scalers.
|
| |
| void | unscale_features (const string &, const vector< Descriptives > &) override |
| | Reverts the scaling of features with the given role using the supplied descriptives.
|
| |
| VectorI | calculate_target_distribution () const override |
| | Returns the distribution of target classes for classification datasets.
|
| |
| vector< vector< Index > > | calculate_Tukey_outliers (const float=1.5f, bool=false) |
| | Detects Tukey outliers per variable using the given fence multiplier.
|
| |
| vector< vector< Index > > | replace_Tukey_outliers_with_NaN (const float=1.5f) |
| | Replaces detected Tukey outliers with NaN and returns the affected indices.
|
| |
| void | unuse_Tukey_outliers (const float=1.5f) |
| | Marks samples containing Tukey outliers as unused.
|
| |
| void | set_data_random () override |
| | Fills the data matrix with random values.
|
| |
| void | set_data_integer (const Index vocabulary_size) override |
| | Fills the data matrix with random integers up to the given vocabulary size.
|
| |
| void | set_data_rosenbrock () |
| | Fills the data matrix with samples drawn from the Rosenbrock function (regression test data).
|
| |
| void | set_data_binary_classification () |
| | Fills the data matrix with a synthetic binary classification dataset.
|
| |
| void | from_JSON (const JsonDocument &) override |
| | Loads dataset state from a JSON document.
|
| |
| void | to_JSON (JsonWriter &) const override |
| | Writes dataset state to a JSON writer.
|
| |
| void | read_csv () |
| | Reads the configured CSV file into the dataset, inferring types and headers.
|
| |
| void | set (const Index=0, const Shape &={}, const Shape &={}) |
| | Resets the dataset with the given sample count and input/target shapes.
|
| |
| virtual | ~Dataset ()=default |
| |
| virtual Index | get_samples_number () const |
| | Returns the total number of samples (rows) in the data matrix.
|
| |
| Index | get_samples_number (const string &) const |
| | Returns the number of samples with the given role ("Training", "Validation", ...).
|
| |
| Index | get_used_samples_number () const |
| | Returns the number of samples whose role is not None.
|
| |
| vector< Index > | get_sample_indices (const string &) const |
| | Returns indices of samples with the given role name.
|
| |
| vector< Index > | get_used_sample_indices () const |
| | Returns indices of all samples whose role is not None.
|
| |
| const vector< SampleRole > & | get_sample_roles () const |
| | Returns the per-sample role vector.
|
| |
| vector< Index > | get_sample_roles_vector () const |
| | Returns the per-sample roles as integer indices.
|
| |
| VectorI | get_sample_role_numbers () const |
| | Returns the per-sample roles as a tensor of integer indices.
|
| |
| Index | get_variables_number () const |
| | Returns the total number of variables (column descriptors).
|
| |
| Index | get_variables_number (const string &) const |
| | Returns the number of variables with the given role name.
|
| |
| Index | get_used_variables_number () const |
| | Returns the number of variables whose role is in use.
|
| |
| const vector< Variable > & | get_variables () const |
| | Returns the variable descriptors.
|
| |
| vector< Variable > | get_variables (const string &) const |
| | Returns the variables with the given role name.
|
| |
| Index | get_variable_index (const string &) const |
| | Returns the index of the variable with the given name.
|
| |
| Index | get_variable_index (const Index) const |
| | Returns the variable index corresponding to a flat feature index.
|
| |
| vector< Index > | get_variable_indices (const string &) const |
| | Returns the indices of variables matching the given role name.
|
| |
| vector< Index > | get_used_variables_indices () const |
| | Returns the indices of all in-use variables.
|
| |
| vector< string > | get_variable_names () const |
| | Returns the names of all variables.
|
| |
| vector< string > | get_variable_names (const string &) const |
| | Returns the names of variables with the given role name.
|
| |
| VariableType | get_variable_type (const Index index) const |
| | Returns the VariableType of the variable at the given index.
|
| |
| vector< VariableType > | get_variable_types (const vector< Index > &indices) const |
| | Returns the VariableType for each of the given variable indices.
|
| |
| Index | get_features_number () const |
| | Returns the total number of features (data matrix columns).
|
| |
| Index | get_features_number (const string &) const |
| | Returns the number of features for variables with the given role name.
|
| |
| Index | get_used_features_number () const |
| | Returns the number of features for in-use variables.
|
| |
| vector< string > | get_feature_names () const |
| | Returns the expanded feature names (one entry per data matrix column).
|
| |
| vector< string > | get_feature_names (const string &) const |
| | Returns the expanded feature names restricted to the given role.
|
| |
| vector< vector< Index > > | get_feature_indices () const |
| | Returns the data column indices grouped per variable.
|
| |
| vector< Index > | get_feature_indices (const Index) const |
| | Returns the data column indices that belong to the given variable index.
|
| |
| vector< Index > | get_feature_indices (const string &) const |
| | Returns the data column indices that belong to variables with the given role name.
|
| |
| vector< Index > | get_used_feature_indices () const |
| | Returns the data column indices that belong to in-use variables.
|
| |
| vector< Index > | get_feature_dimensions () const |
| | Returns the per-variable feature dimension counts.
|
| |
| Shape | get_shape (const string &) const |
| | Returns the configured Shape for the given role ("Input", "Target", "Decoder").
|
| |
| virtual void | get_batches (const vector< Index > &, Index, bool, vector< vector< Index > > &) const |
| | Splits the given sample indices into batches and writes the result into the output vector.
|
| |
| const vector< vector< string > > & | get_data_file_preview () const |
| | Returns the parsed preview rows captured during the last file read.
|
| |
| const filesystem::path & | get_data_path () const |
| | Returns the configured data file path.
|
| |
| const Separator & | get_separator () const |
| | Returns the current field Separator.
|
| |
| string | get_separator_string () const |
| | Returns the separator as the literal character used in files.
|
| |
| string | get_separator_name () const |
| | Returns the separator as its enumerator name ("Space", "Tab", ...).
|
| |
| const Codification & | get_codification () const |
| | Returns the configured text Codification.
|
| |
| string | get_codification_string () const |
| | Returns the codification as its enumerator name.
|
| |
| bool | get_display () const |
| | Returns whether progress messages are printed.
|
| |
| virtual bool | is_empty () const |
| | Returns true when the dataset contains no samples.
|
| |
| Shape | get_input_shape () const |
| | Returns the configured input tensor shape.
|
| |
| Shape | get_target_shape () const |
| | Returns the configured target tensor shape.
|
| |
| const MatrixR & | get_data () const |
| | Returns the raw data matrix (rows = samples, columns = features).
|
| |
| void | set_data (const MatrixR &) |
| | Replaces the underlying data matrix.
|
| |
| void | set_data_constant (const float) |
| | Fills the data matrix with the given constant value.
|
| |
| void | set_default () |
| | Restores default dataset settings.
|
| |
| void | set_sample_roles (const string &) |
| | Assigns the same role to every sample.
|
| |
| void | set_sample_role (const Index, const string &) |
| | Sets the role of a single sample by index.
|
| |
| void | set_sample_roles (const vector< string > &) |
| | Assigns a role per sample from a string vector.
|
| |
| void | set_sample_roles (const vector< Index > &, const string &) |
| | Assigns the same role to the given sample indices.
|
| |
| void | set_variables (const vector< Variable > &new_variables) |
| |
| void | set_default_variable_names () |
| | Assigns default placeholder names to all variables.
|
| |
| void | set_variable_roles (const vector< string > &) |
| | Sets the role of every variable from the given string list.
|
| |
| void | set_variable_indices (const vector< Index > &, const vector< Index > &) |
| | Sets which variable indices are inputs and which are targets.
|
| |
| void | set_input_variables_unused () |
| | Marks all input variables as unused (role None).
|
| |
| void | set_variable_role (const Index, const string &) |
| | Sets the role of the variable at the given index.
|
| |
| void | set_variable_role (const string &, const string &) |
| | Sets the role of the variable with the given name.
|
| |
| void | set_variable_type (const Index, const VariableType &) |
| | Sets the type of the variable at the given index.
|
| |
| void | set_variable_type (const string &, const VariableType &) |
| | Sets the type of the variable with the given name.
|
| |
| void | set_variable_types (const VariableType &) |
| | Sets every variable to the given type.
|
| |
| void | set_binary_variables () |
| | Detects and marks variables with binary values as VariableType::Binary.
|
| |
| void | set_variable_names (const vector< string > &) |
| | Assigns names to all variables from the given list.
|
| |
| void | set_variables_number (const Index new_size) |
| |
| void | set_feature_names (const vector< string > &) |
| | Assigns expanded feature names, propagating categories back to variables.
|
| |
| void | set_variable_roles (const string &) |
| | Assigns the same role to every variable.
|
| |
| void | set_shape (const string &, const Shape &) |
| | Sets the tensor Shape associated with the given role ("Input"/"Target"/"Decoder").
|
| |
| virtual void | resize_input_shape (Index input_features_count) |
| | Resizes the input shape to the given flat feature count.
|
| |
| void | set_data_path (const filesystem::path &new_data_path) |
| |
| void | set_has_header (bool new_has_header) |
| |
| void | set_has_ids (bool new_has_ids) |
| |
| void | set_separator (const Separator &new_separator) |
| |
| void | set_separator_string (const string &) |
| | Sets the separator from its literal character.
|
| |
| void | set_separator_name (const string &) |
| | Sets the separator from its enumerator name.
|
| |
| void | set_codification (const Codification &new_codification) |
| |
| void | set_codification (const string &) |
| | Sets the codification from its enumerator name.
|
| |
| void | set_display (bool new_display) |
| |
| bool | is_sample_used (const Index i) const |
| | Returns true if the sample at i has a role other than None.
|
| |
| bool | has_binary_variables () const |
| | Returns true if any variable has type Binary.
|
| |
| bool | has_categorical_variables () const |
| | Returns true if any variable has type Categorical.
|
| |
| bool | has_binary_or_categorical_variables () const |
| | Returns true if any variable is Binary or Categorical.
|
| |
| bool | has_time_variable () const |
| | Returns true if any variable has role Time.
|
| |
| bool | has_validation () const |
| | Returns true if any sample is assigned the Validation role.
|
| |
| void | split_samples (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true) |
| | Splits samples into Training/Validation/Testing roles, optionally shuffled.
|
| |
| void | split_samples_sequential (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f) |
| | Splits samples sequentially without shuffling.
|
| |
| void | split_samples_random (const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f) |
| | Splits samples randomly across roles.
|
| |
| vector< vector< Index > > | split_samples (const vector< Index > &, Index) const |
| | Splits the given indices into chunks of the requested size.
|
| |
| MatrixR | get_data (const string &, const string &) const |
| | Returns the data submatrix for the given sample role and feature role.
|
| |
| MatrixR | get_data_from_indices (const vector< Index > &, const vector< Index > &) const |
| | Returns a submatrix from the data using explicit row and column indices.
|
| |
| VectorR | get_sample_data (const Index) const |
| | Returns the row of the data matrix at the given sample index.
|
| |
| MatrixR | get_variable_data (const Index) const |
| | Returns the data columns belonging to the variable at index.
|
| |
| MatrixR | get_variable_data (const Index, const vector< Index > &) const |
| | Returns the variable data restricted to the given sample indices.
|
| |
| MatrixR | get_variable_data (const string &) const |
| | Returns the data columns of the variable with the given name.
|
| |
| MatrixR | get_feature_data (const string &) const |
| | Returns the data columns belonging to features with the given role name.
|
| |
| void | set (const Index=0, const Shape &={}, const Shape &={}) |
| | Resets the dataset with the given sample count and input/target shapes.
|
| |
| bool | has_nan () const |
| | Returns true if any entry in the data matrix is NaN.
|
| |
| bool | has_nan_row (const Index) const |
| | Returns true if the sample at the given row contains a NaN.
|
| |
| VectorI | count_nans_per_variable () const |
| | Returns the NaN count for each variable.
|
| |
| Index | count_variables_with_nan () const |
| | Returns the number of variables that contain at least one NaN.
|
| |
| Index | count_rows_with_nan () const |
| | Returns the number of rows that contain at least one NaN.
|
| |
| Index | count_nan () const |
| | Returns the total NaN count in the data matrix.
|
| |
| void | save (const filesystem::path &) const |
| | Saves the dataset metadata to a JSON file at the given path.
|
| |
| void | load (const filesystem::path &) |
| | Loads the dataset metadata from the JSON file at the given path.
|
| |
| void | save_data () const |
| | Saves the data matrix to the configured data path.
|
| |
| void | save_data_binary (const filesystem::path &) const |
| | Saves the data matrix to the given path in binary form.
|
| |
| void | load_data_binary () |
| | Loads the data matrix from the binary file at the configured data path.
|
| |
| virtual void | fill_inputs (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const |
| | Copies input features of the given samples into a destination buffer.
|
| |
| virtual void | augment_inputs (float *, Index) const |
| | Applies data augmentation in place to the input buffer (no-op in base class).
|
| |
| virtual void | fill_decoder (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const |
| | Copies decoder input features into a destination buffer (sequence models).
|
| |
| virtual void | fill_targets (const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const |
| | Copies target features of the given samples into a destination buffer.
|
| |