OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
tabular_dataset.h
Go to the documentation of this file.
1// OpenNN: Open Neural Networks Library
2// www.opennn.net
3//
4// T A B U L A R   D A T A S E T   C L A S S   H E A D E R
5//
6// Artificial Intelligence Techniques SL
7// artelnics@artelnics.com
8
9#pragma once
10
11#include "dataset.h"
12#include "correlations.h"
13#include "statistics.h"
14
15namespace opennn
16{
17
18/// Dataset specialization for tabular data: file/CSV loading, missing-value handling, feature scaling, correlation analysis and Tukey outlier detection.
19class TabularDataset : public Dataset
20{
21
22public:
23
26
30 /// Creates a tabular dataset with the given sample count and input/target shapes.
31 TabularDataset(const Index = 0,
32 const Shape& = {0},
33 const Shape& = {0});
34
40 /// Creates a tabular dataset by reading the given file with the given separator.
41 TabularDataset(const filesystem::path&,
42 const string&,
43 bool = true,
44 bool = false,
45 const Codification& = Codification::UTF8);
46
47 using Dataset::set; ///< Keeps the base-class set() overloads visible next to the overloads declared below.
53 /// Resets the dataset from the given file using the given parsing options.
54 void set(const filesystem::path&,
55 const string&,
56 bool = true,
57 bool = false,
58 const Codification& = Codification::UTF8);
59 /// Resets the dataset by reading from the given path with default parsing options.
60 void set(const filesystem::path&);
61
62 /// Returns the scaler names for variables matching the given role.
63 vector<string> get_feature_scalers(const string&) const;
64
65 /// Sets every variable's scaler from the given name.
66 void set_variable_scalers(const string&);
67 /// Sets each variable's scaler from the corresponding name in the list.
68 void set_variable_scalers(const vector<string>&);
71
72 void set_gmt(const Index new_gmt) { gmt = new_gmt; } ///< Stores new_gmt in the gmt member.
73
79 /// Infers the date/time format used in the given file preview rows.
80 DateFormat infer_dataset_date_format(const vector<Variable>&, const vector<vector<string_view>>&, bool, const string&);
81
85 const string& get_missing_values_label() const { return missing_values_label; } ///< Returns the current missing-values label.
87
88 void set_missing_values_label(string label) { missing_values_label = move(label); } ///< Takes the label by value and moves it into the member.
90 /// Sets the missing-values method from its enumerator name.
91 void set_missing_values_method(const string&);
92
93 /// Returns true if any field in the row matches the missing-values label.
94 bool has_missing_values(const vector<string_view>&) const;
95
96 /// Removes or imputes missing values using the configured MissingValuesMethod.
97 void scrub_missing_values() override;
106
108 /// Marks input variables whose absolute correlation with targets is below the threshold as unused.
109 vector<string> unuse_uncorrelated_variables(const float = 0.25f);
111 /// Marks input variables that are highly collinear (above the threshold) as unused.
112 vector<string> unuse_collinear_variables(const float = 0.95f);
113
114 /// Returns descriptive statistics for every feature across all samples.
115 vector<Descriptives> calculate_feature_descriptives() const;
116 /// Returns descriptive statistics for features with the given role name.
117 vector<Descriptives> calculate_feature_descriptives(const string&) const override;
118
123 /// Returns variable descriptives restricted to the given target class index.
124 vector<Descriptives> calculate_variable_descriptives_categories(const Index) const;
125
126 /// Returns per-variable histograms with the given bin count.
127 vector<Histogram> calculate_variable_distributions(const Index = 10) const;
128 /// Returns per-variable Tukey box-plot summaries.
129 vector<BoxPlot> calculate_variables_box_plots() const;
130
131 /// Returns the input-input correlation matrix using the given correlation function and method.
132 Tensor<Correlation, 2> calculate_input_variable_correlations(
133 Correlation (*)(const MatrixR&, const MatrixR&), Correlation::Method, const string&) const;
134 /// Returns the Pearson correlation matrix among input variables.
135 Tensor<Correlation, 2> calculate_input_variable_pearson_correlations() const;
138
139 /// Returns the input-target correlation matrix using the given correlation function.
140 Tensor<Correlation, 2> calculate_input_target_variable_correlations(
141 Correlation (*)(const MatrixR&, const MatrixR&), const string&) const;
142 /// Returns the Pearson correlation matrix between input and target variables.
143 Tensor<Correlation, 2> calculate_input_target_variable_pearson_correlations() const override;
146
149
150 /// Scales every feature column using its configured scaler; returns the applied descriptives.
151 vector<Descriptives> scale_data();
152 /// Scales the features with the given role using their configured scalers.
153 vector<Descriptives> scale_features(const string&) override;
154 /// Reverts the scaling of features with the given role using the supplied descriptives.
155 void unscale_features(const string&, const vector<Descriptives>&) override;
156
162 /// Detects Tukey outliers per variable using the given fence multiplier.
163 vector<vector<Index>> calculate_Tukey_outliers(const float = 1.5f, bool = false);
164 /// Replaces detected Tukey outliers with NaN and returns the affected indices.
165 vector<vector<Index>> replace_Tukey_outliers_with_NaN(const float = 1.5f);
166 /// Marks samples containing Tukey outliers as unused.
167 void unuse_Tukey_outliers(const float = 1.5f);
168
169 /// Fills the data matrix with random values.
170 void set_data_random() override;
171 /// Fills the data matrix with random integers up to the given vocabulary size.
172 void set_data_integer(const Index vocabulary_size) override;
177
178 void from_JSON(const JsonDocument&) override; ///< Loads dataset state from a JSON document.
179 void to_JSON(JsonWriter&) const override; ///< Writes dataset state to a JSON writer.
180
181 /// Reads the configured CSV file into the dataset, inferring types and headers.
182 void read_csv();
183
184protected:
185
186 string missing_values_label = "NA"; ///< Text that marks a missing field; defaults to "NA".
191
192 Index gmt = 0; ///< Value assigned by set_gmt(); defaults to 0. NOTE(review): presumably a GMT offset used when parsing dates - confirm.
193
196
197 void infer_column_types(const vector<vector<string_view>>&);
198
199 vector<Index> filter_used_samples_by_column(Index, bool) const;
200 void apply_scaler(Index, const string&, const Descriptives&, bool);
201};
202
203}
204
205// OpenNN: Open Neural Networks Library.
206// Copyright(C) 2005-2026 Artificial Intelligence Techniques, SL.
207// Licensed under the GNU Lesser General Public License v2.1 or later.
Dataset()=default
void set(const Index=0, const Shape &={}, const Shape &={})
Resets the dataset with the given sample count and input/target shapes.
Codification
Text encoding of the source data file.
Definition dataset.h:66
@ UTF8
Definition dataset.h:66
Definition json.h:72
Definition json.h:85
Definition json.h:23
vector< Descriptives > scale_data()
Scales every feature column using its configured scaler; returns the applied descriptives.
void set(const filesystem::path &, const string &, bool=true, bool=false, const Codification &=Codification::UTF8)
Resets the dataset from the given file using the given parsing options.
void set_missing_values_label(string label)
Definition tabular_dataset.h:88
bool has_missing_values(const vector< string_view > &) const
Returns true if any field in the row matches the missing-values label.
Tensor< Correlation, 2 > calculate_input_target_variable_pearson_correlations() const override
Returns the Pearson correlation matrix between input and target variables.
void missing_values_from_JSON(const Json *)
vector< Descriptives > calculate_feature_descriptives(const string &) const override
Returns descriptive statistics for features with the given role name.
void missing_values_to_JSON(JsonWriter &) const
TabularDataset(const Index=0, const Shape &={0}, const Shape &={0})
Creates a tabular dataset with the given sample count and input/target shapes.
string get_missing_values_method_string() const
Returns the missing-values method as its enumerator name.
DateFormat infer_dataset_date_format(const vector< Variable > &, const vector< vector< string_view > > &, bool, const string &)
Infers the date/time format used in the given file preview rows.
void set_data_binary_classification()
Fills the data matrix with a synthetic binary classification dataset.
void read_csv()
Reads the configured CSV file into the dataset, inferring types and headers.
vector< Descriptives > calculate_variable_descriptives_negative_samples() const
Returns variable descriptives computed only over samples with negative target.
void apply_scaler(Index, const string &, const Descriptives &, bool)
void infer_column_types(const vector< vector< string_view > > &)
vector< vector< Index > > replace_Tukey_outliers_with_NaN(const float=1.5f)
Replaces detected Tukey outliers with NaN and returns the affected indices.
vector< Descriptives > calculate_variable_descriptives_positive_samples() const
Returns variable descriptives computed only over samples with positive target.
vector< BoxPlot > calculate_variables_box_plots() const
Returns per-variable Tukey box-plot summaries.
void from_JSON(const JsonDocument &) override
Loads dataset state from a JSON document.
Index get_missing_values_number() const
Definition tabular_dataset.h:86
void set_variable_scalers(const string &)
Sets every variable's scaler from the given name.
const string & get_missing_values_label() const
Definition tabular_dataset.h:85
Tensor< Correlation, 2 > calculate_input_variable_pearson_correlations() const
Returns the Pearson correlation matrix among input variables.
void calculate_missing_values_statistics()
Refreshes per-variable and per-row missing-value counts.
vector< vector< Index > > calculate_Tukey_outliers(const float=1.5f, bool=false)
Detects Tukey outliers per variable using the given fence multiplier.
MissingValuesMethod missing_values_method
Definition tabular_dataset.h:187
void to_JSON(JsonWriter &) const override
Writes dataset state to a JSON writer.
vector< string > unuse_uncorrelated_variables(const float=0.25f)
Marks input variables whose absolute correlation with targets is below the threshold as unused.
VectorI calculate_correlations_rank() const override
Returns input variable indices ranked by absolute correlation with the target.
vector< Descriptives > scale_features(const string &) override
Scales the features with the given role using their configured scalers.
void set_default_variable_scalers()
Assigns default scalers based on each variable's type.
Tensor< Correlation, 2 > calculate_input_variable_correlations(Correlation(*)(const MatrixR &, const MatrixR &), Correlation::Method, const string &) const
Returns the input-input correlation matrix using the given correlation function and method.
Tensor< Correlation, 2 > calculate_input_target_variable_correlations(Correlation(*)(const MatrixR &, const MatrixR &), const string &) const
Returns the input-target correlation matrix using the given correlation function.
vector< string > get_feature_scalers(const string &) const
Returns the scaler names for variables matching the given role.
virtual void impute_missing_values_interpolate()
Fills missing values by interpolating between neighboring samples.
void set_data_random() override
Fills the data matrix with random values.
MissingValuesMethod
Strategy for handling missing values in tabular data.
Definition tabular_dataset.h:25
@ Unuse
Definition tabular_dataset.h:25
@ Mean
Definition tabular_dataset.h:25
@ Interpolation
Definition tabular_dataset.h:25
@ Median
Definition tabular_dataset.h:25
vector< Descriptives > calculate_variable_descriptives_categories(const Index) const
Returns variable descriptives restricted to the given target class index.
Tensor< Correlation, 2 > calculate_input_variable_spearman_correlations() const
Returns the Spearman correlation matrix among input variables.
Index gmt
Definition tabular_dataset.h:192
void impute_missing_values_statistic(const MissingValuesMethod &)
Imputes missing values using the given statistic-based method (Mean/Median).
string missing_values_label
Definition tabular_dataset.h:186
void set_data_integer(const Index vocabulary_size) override
Fills the data matrix with random integers up to the given vocabulary size.
Index rows_missing_values_number
Definition tabular_dataset.h:190
TabularDataset(const filesystem::path &, const string &, bool=true, bool=false, const Codification &=Codification::UTF8)
Creates a tabular dataset by reading the given file with the given separator.
VectorI calculate_target_distribution() const override
Returns the distribution of target classes for classification datasets.
void set_missing_values_method(const MissingValuesMethod &method)
Definition tabular_dataset.h:89
void set_missing_values_method(const string &)
Sets the missing-values method from its enumerator name.
void set_gmt(const Index new_gmt)
Definition tabular_dataset.h:72
MissingValuesMethod get_missing_values_method() const
Definition tabular_dataset.h:82
Tensor< Correlation, 2 > calculate_input_target_variable_spearman_correlations() const
Returns the Spearman correlation matrix between input and target variables.
void set_variable_scalers(const vector< string > &)
Sets each variable's scaler from the corresponding name in the list.
void set_data_rosenbrock()
Fills the data matrix with samples drawn from the Rosenbrock function (regression test data).
vector< string > unuse_collinear_variables(const float=0.95f)
Marks input variables that are highly collinear (above the threshold) as unused.
Index missing_values_number
Definition tabular_dataset.h:188
void unuse_Tukey_outliers(const float=1.5f)
Marks samples containing Tukey outliers as unused.
void unscale_features(const string &, const vector< Descriptives > &) override
Reverts the scaling of features with the given role using the supplied descriptives.
VectorI variables_missing_values_number
Definition tabular_dataset.h:189
void set(const filesystem::path &)
Resets the dataset by reading from the given path with default parsing options.
vector< Histogram > calculate_variable_distributions(const Index=10) const
Returns per-variable histograms with the given bin count.
virtual void impute_missing_values_unuse()
Marks samples containing missing values as unused.
vector< Index > filter_used_samples_by_column(Index, bool) const
void scrub_missing_values() override
Removes or imputes missing values using the configured MissingValuesMethod.
vector< Descriptives > calculate_feature_descriptives() const
Returns descriptive statistics for every feature across all samples.
Definition adaptive_moment_estimation.h:14
DateFormat
Order of the day, month, and year fields in a date string (AUTO probes the input).
Definition string_utilities.h:44
Matrix< float, Dynamic, Dynamic, Layout > MatrixR
Definition pch.h:177
Matrix< Index, Dynamic, 1 > VectorI
Definition pch.h:182
Correlation
Result of a correlation analysis: model parameters, fit quality, and the method/form used.
Definition correlations.h:18
Method
Underlying coefficient family used to compute r.
Definition correlations.h:20
Descriptives
Summary statistics (minimum, maximum, mean, standard deviation) for one variable.
Definition statistics.h:18
Shape
Fixed-capacity small-vector describing tensor dimensions (rank up to MaxRank).
Definition tensor_utilities.h:42