54 void set(
const filesystem::path&,
60 void set(
const filesystem::path&);
void set(const Index=0, const Shape &={}, const Shape &={})
Resets the dataset with the given sample count and input/target shapes.
Codification
Text encoding of the source data file.
Definition dataset.h:66
@ UTF8
Definition dataset.h:66
vector< Descriptives > scale_data()
Scales every feature column using its configured scaler; returns the applied descriptives.
void set(const filesystem::path &, const string &, bool=true, bool=false, const Codification &=Codification::UTF8)
Resets the dataset from the given file using the given parsing options.
void set_missing_values_label(string label)
Definition tabular_dataset.h:88
bool has_missing_values(const vector< string_view > &) const
Returns true if any field in the row matches the missing-values label.
Tensor< Correlation, 2 > calculate_input_target_variable_pearson_correlations() const override
Returns the Pearson correlation matrix between input and target variables.
void missing_values_from_JSON(const Json *)
vector< Descriptives > calculate_feature_descriptives(const string &) const override
Returns descriptive statistics for features with the given role name.
void missing_values_to_JSON(JsonWriter &) const
TabularDataset(const Index=0, const Shape &={0}, const Shape &={0})
Creates a tabular dataset with the given sample count and input/target shapes.
string get_missing_values_method_string() const
Returns the missing-values method as its enumerator name.
DateFormat infer_dataset_date_format(const vector< Variable > &, const vector< vector< string_view > > &, bool, const string &)
Infers the date/time format used in the given file preview rows.
void set_data_binary_classification()
Fills the data matrix with a synthetic binary classification dataset.
void read_csv()
Reads the configured CSV file into the dataset, inferring types and headers.
vector< Descriptives > calculate_variable_descriptives_negative_samples() const
Returns variable descriptives computed only over samples with a negative target.
void apply_scaler(Index, const string &, const Descriptives &, bool)
void infer_column_types(const vector< vector< string_view > > &)
vector< vector< Index > > replace_Tukey_outliers_with_NaN(const float=1.5f)
Replaces detected Tukey outliers with NaN and returns the affected indices.
vector< Descriptives > calculate_variable_descriptives_positive_samples() const
Returns variable descriptives computed only over samples with a positive target.
vector< BoxPlot > calculate_variables_box_plots() const
Returns per-variable Tukey box-plot summaries.
void from_JSON(const JsonDocument &) override
Loads dataset state from a JSON document.
Index get_missing_values_number() const
Definition tabular_dataset.h:86
void set_variable_scalers(const string &)
Sets every variable's scaler from the given name.
const string & get_missing_values_label() const
Definition tabular_dataset.h:85
Tensor< Correlation, 2 > calculate_input_variable_pearson_correlations() const
Returns the Pearson correlation matrix among input variables.
void calculate_missing_values_statistics()
Refreshes per-variable and per-row missing-value counts.
vector< vector< Index > > calculate_Tukey_outliers(const float=1.5f, bool=false)
Detects Tukey outliers per variable using the given fence multiplier.
MissingValuesMethod missing_values_method
Definition tabular_dataset.h:187
void to_JSON(JsonWriter &) const override
Writes dataset state to a JSON writer.
vector< string > unuse_uncorrelated_variables(const float=0.25f)
Marks input variables whose absolute correlation with targets is below the threshold as unused.
VectorI calculate_correlations_rank() const override
Returns input variable indices ranked by absolute correlation with the target.
vector< Descriptives > scale_features(const string &) override
Scales the features with the given role using their configured scalers.
void set_default_variable_scalers()
Assigns default scalers based on each variable's type.
Tensor< Correlation, 2 > calculate_input_variable_correlations(Correlation(*)(const MatrixR &, const MatrixR &), Correlation::Method, const string &) const
Returns the input-input correlation matrix using the given correlation function and method.
Tensor< Correlation, 2 > calculate_input_target_variable_correlations(Correlation(*)(const MatrixR &, const MatrixR &), const string &) const
Returns the input-target correlation matrix using the given correlation function.
vector< string > get_feature_scalers(const string &) const
Returns the scaler names for variables matching the given role.
virtual void impute_missing_values_interpolate()
Fills missing values by interpolating between neighboring samples.
void set_data_random() override
Fills the data matrix with random values.
MissingValuesMethod
Strategy for handling missing values in tabular data.
Definition tabular_dataset.h:25
@ Unuse
Definition tabular_dataset.h:25
@ Mean
Definition tabular_dataset.h:25
@ Interpolation
Definition tabular_dataset.h:25
@ Median
Definition tabular_dataset.h:25
vector< Descriptives > calculate_variable_descriptives_categories(const Index) const
Returns variable descriptives restricted to the given target class index.
Tensor< Correlation, 2 > calculate_input_variable_spearman_correlations() const
Returns the Spearman correlation matrix among input variables.
Index gmt
Definition tabular_dataset.h:192
void impute_missing_values_statistic(const MissingValuesMethod &)
Imputes missing values using the given statistic-based method (Mean/Median).
string missing_values_label
Definition tabular_dataset.h:186
void set_data_integer(const Index vocabulary_size) override
Fills the data matrix with random integers up to the given vocabulary size.
Index rows_missing_values_number
Definition tabular_dataset.h:190
TabularDataset(const filesystem::path &, const string &, bool=true, bool=false, const Codification &=Codification::UTF8)
Creates a tabular dataset by reading the given file with the given separator.
VectorI calculate_target_distribution() const override
Returns the distribution of target classes for classification datasets.
void set_missing_values_method(const MissingValuesMethod &method)
Definition tabular_dataset.h:89
void set_missing_values_method(const string &)
Sets the missing-values method from its enumerator name.
void set_gmt(const Index new_gmt)
Definition tabular_dataset.h:72
MissingValuesMethod get_missing_values_method() const
Definition tabular_dataset.h:82
Tensor< Correlation, 2 > calculate_input_target_variable_spearman_correlations() const
Returns the Spearman correlation matrix between input and target variables.
void set_variable_scalers(const vector< string > &)
Sets each variable's scaler from the corresponding name in the list.
void set_data_rosenbrock()
Fills the data matrix with samples drawn from the Rosenbrock function (regression test data).
vector< string > unuse_collinear_variables(const float=0.95f)
Marks input variables that are highly collinear (above the threshold) as unused.
Index missing_values_number
Definition tabular_dataset.h:188
void unuse_Tukey_outliers(const float=1.5f)
Marks samples containing Tukey outliers as unused.
void unscale_features(const string &, const vector< Descriptives > &) override
Reverts the scaling of features with the given role using the supplied descriptives.
VectorI variables_missing_values_number
Definition tabular_dataset.h:189
void set(const filesystem::path &)
Resets the dataset by reading from the given path with default parsing options.
vector< Histogram > calculate_variable_distributions(const Index=10) const
Returns per-variable histograms with the given bin count.
virtual void impute_missing_values_unuse()
Marks samples containing missing values as unused.
vector< Index > filter_used_samples_by_column(Index, bool) const
void scrub_missing_values() override
Removes or imputes missing values using the configured MissingValuesMethod.
vector< Descriptives > calculate_feature_descriptives() const
Returns descriptive statistics for every feature across all samples.
Definition adaptive_moment_estimation.h:14
DateFormat
Order of the day, month, and year fields in a date string (AUTO probes the input).
Definition string_utilities.h:44
Matrix< float, Dynamic, Dynamic, Layout > MatrixR
Definition pch.h:177
Matrix< Index, Dynamic, 1 > VectorI
Definition pch.h:182
Result of a correlation analysis: model parameters, fit quality, and the method/form used.
Definition correlations.h:18
Method
Underlying coefficient family used to compute r.
Definition correlations.h:20
Summary statistics (minimum, maximum, mean, standard deviation) for one variable.
Definition statistics.h:18
Fixed-capacity small-vector describing tensor dimensions (rank up to MaxRank).
Definition tensor_utilities.h:42