33 static const vector<pair<SampleRole, string>> entries = {
161 virtual void get_batches(
const vector<Index>&, Index,
bool, vector<vector<Index>>&)
const;
294 float selection_ratio = 0.2f,
295 float testing_ratio = 0.2f,
300 float selection_ratio = 0.2f,
301 float testing_ratio = 0.2f);
305 float selection_ratio = 0.2f,
306 float testing_ratio = 0.2f);
309 [[nodiscard]] vector<vector<Index>>
split_samples(
const vector<Index>&, Index)
const;
378 void save(
const filesystem::path&)
const;
380 void load(
const filesystem::path&);
397 const vector<Index>&,
400 bool parallelize =
true,
401 int contiguous = -1)
const;
408 const vector<Index>&,
411 bool parallelize =
true,
412 int contiguous = -1)
const;
416 const vector<Index>&,
419 bool parallelize =
true,
420 int contiguous = -1)
const;
454 const vector<string>
positive_words = {
"1",
"yes",
"positive",
"+",
"true",
"good",
"si",
"sí",
"SÍ"};
455 const vector<string>
negative_words = {
"0",
"no",
"negative",
"-",
"false",
"bad",
"not",
"No"};
void set_variable_type(const string &, const VariableType &)
Sets the type of the variable with the given name.
void split_samples_sequential(const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
Splits samples sequentially without shuffling.
vector< string > get_feature_names() const
Returns the expanded feature names (one entry per data matrix column).
Index count_nan() const
Returns the total NaN count in the data matrix.
MatrixR get_variable_data(const Index) const
Returns the data columns belonging to the variable at index.
MatrixR get_data(const string &, const string &) const
Returns the data submatrix for the given sample role and feature role.
void set_variable_indices(const vector< Index > &, const vector< Index > &)
Sets which variable indices are inputs and which are targets.
void preview_data_to_JSON(JsonWriter &) const
void set_input_variables_unused()
Marks all input variables as unused (role None).
void set_variables_number(const Index new_size)
Definition dataset.h:244
vector< Index > get_used_variables_indices() const
Returns the indices of all in-use variables.
void set_variable_types(const VariableType &)
Sets every variable to the given type.
void split_samples_random(const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f)
Splits samples randomly across roles.
void set_separator(const Separator &new_separator)
Definition dataset.h:261
vector< Index > get_sample_roles_vector() const
Returns the per-sample roles as integer indices.
MatrixR get_variable_data(const Index, const vector< Index > &) const
Returns the variable data restricted to the given sample indices.
MatrixR get_feature_data(const string &) const
Returns the data columns belonging to features with the given role name.
void set_sample_roles(const vector< Index > &, const string &)
Assigns the same role to the given sample indices.
virtual void fill_decoder(const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
Copies decoder input features into a destination buffer (sequence models).
Codification codification
Definition dataset.h:449
void set_display(bool new_display)
Definition dataset.h:271
bool has_header
Definition dataset.h:447
virtual bool is_empty() const
Returns true when the dataset contains no samples.
Definition dataset.h:185
Index get_variables_number(const string &) const
Returns the number of variables with the given role name.
string get_separator_name() const
Returns the separator as its enumerator name ("Space", "Tab", ...).
void set_sample_roles(const vector< string > &)
Assigns a role per sample from a string vector.
MatrixR data
Definition dataset.h:443
void set_data(const MatrixR &)
Replaces the underlying data matrix.
Shape target_shape
Definition dataset.h:435
virtual vector< Descriptives > calculate_feature_descriptives(const string &) const
Returns descriptive statistics for features with the given role (subclass-specific).
Definition dataset.h:366
virtual void get_batches(const vector< Index > &, Index, bool, vector< vector< Index > > &) const
Splits sample indices into batches and writes them into batches.
virtual Index get_samples_number() const
Returns the total number of samples (rows) in the data matrix.
Definition dataset.h:74
bool has_sample_ids
Definition dataset.h:448
void set_codification(const Codification &new_codification)
Definition dataset.h:267
Shape get_target_shape() const
Returns the configured target tensor shape.
Definition dataset.h:190
Index get_variable_index(const string &) const
Returns the index of the variable with the given name.
Shape decoder_shape
Definition dataset.h:436
void load(const filesystem::path &)
Loads the dataset metadata from the JSON file at the given path.
const vector< SampleRole > & get_sample_roles() const
Returns the per-sample role vector.
Definition dataset.h:89
void set_variable_roles(const string &)
Assigns the same role to every variable.
void set_codification(const string &)
Sets the codification from its enumerator name.
MatrixR get_data_from_indices(const vector< Index > &, const vector< Index > &) const
Returns a submatrix from the data using explicit row and column indices.
string get_codification_string() const
Returns the codification as its enumerator name.
const vector< string > negative_words
Definition dataset.h:455
Index get_variable_index(const Index) const
Returns the variable index corresponding to a flat feature index.
void set_default()
Restores default dataset settings.
vector< string > sample_ids
Definition dataset.h:439
vector< SampleRole > sample_roles
Definition dataset.h:438
void set_variable_roles(const vector< string > &)
Sets the role of every variable from the given string list.
vector< Index > get_feature_indices(const string &) const
Returns the data column indices that belong to variables with the given role name.
void set(const Index=0, const Shape &={}, const Shape &={})
Resets the dataset with the given sample count and input/target shapes.
void load_data_binary()
Loads the data matrix from the binary file at the configured data path.
Index get_variables_number() const
Returns the total number of variables (column descriptors).
Definition dataset.h:98
void save_data() const
Saves the data matrix to the configured data path.
vector< string > get_variable_names() const
Returns the names of all variables.
void set_binary_variables()
Detects and marks variables with binary values as VariableType::Binary.
virtual void from_JSON(const JsonDocument &)=0
Loads dataset state from a JSON document.
void set_default_variable_roles_forecasting()
const MatrixR & get_data() const
Returns the raw data matrix (rows = samples, columns = features).
Definition dataset.h:193
virtual void set_data_random()
Fills the data matrix with random values (no-op in base class).
Definition dataset.h:315
virtual VectorI calculate_correlations_rank() const
Returns input variables ranked by absolute correlation with the target.
Definition dataset.h:372
vector< vector< Index > > get_feature_indices() const
Returns the data column indices grouped per variable.
VectorI get_sample_role_numbers() const
Returns the per-sample roles as a vector of integer indices.
void set_variable_role(const Index, const string &)
Sets the role of the variable at the given index.
Codification
Text encoding of the source data file.
Definition dataset.h:66
@ SHIFT_JIS
Definition dataset.h:66
@ UTF8
Definition dataset.h:66
Index get_used_samples_number() const
Returns the number of samples whose role is not None.
void save_data_binary(const filesystem::path &) const
Saves the data matrix to the given path in binary form.
virtual void fill_targets(const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
Copies target features of the given samples into a destination buffer.
vector< Index > get_feature_dimensions() const
Returns the per-variable feature dimension counts.
bool has_binary_variables() const
Returns true if any variable has type Binary.
void set_data_path(const filesystem::path &new_data_path)
Definition dataset.h:256
void variables_from_JSON(const Json *)
void set_variable_type(const Index, const VariableType &)
Sets the type of the variable at the given index.
VectorR get_sample_data(const Index) const
Returns the row of the data matrix at the given sample index.
void set_default_variable_roles()
void infer_variable_types_from_data()
const vector< vector< string > > & get_data_file_preview() const
Returns the parsed preview rows captured during the last file read.
Definition dataset.h:164
void set_has_ids(bool new_has_ids)
Definition dataset.h:259
vector< Index > get_variable_indices(const string &) const
Returns the indices of variables matching the given role name.
void set_variable_names(const vector< string > &)
Assigns names to all variables from the given list.
Index count_variables_with_nan() const
Returns the number of variables that contain at least one NaN.
Index count_rows_with_nan() const
Returns the number of rows that contain at least one NaN.
vector< Index > get_feature_indices(const Index) const
Returns the data column indices that belong to the given variable index.
Separator separator
Definition dataset.h:446
void samples_from_JSON(const Json *)
vector< VariableType > get_variable_types(const vector< Index > &indices) const
Returns the VariableType for each of the given variable indices.
void set_shape(const string &, const Shape &)
Sets the tensor Shape associated with the given role ("Input"/"Target"/"Decoder").
Shape get_input_shape() const
Returns the configured input tensor shape.
Definition dataset.h:188
vector< Variable > variables
Definition dataset.h:441
virtual void set_data_integer(const Index)
Fills the data matrix with random integers up to the given vocabulary size.
Definition dataset.h:317
virtual void augment_inputs(float *, Index) const
Applies data augmentation in place to the input buffer (no-op in base class).
Definition dataset.h:404
void set_feature_names(const vector< string > &)
Assigns expanded feature names, propagating categories back to variables.
virtual vector< Descriptives > scale_features(const string &)
Scales the features with the configured method; returns the applied descriptives.
Definition dataset.h:312
virtual VectorI calculate_target_distribution() const
Returns the distribution of target values.
Definition dataset.h:370
virtual void unscale_features(const string &, const vector< Descriptives > &)
Reverts a previously applied scaling using the supplied descriptives.
Definition dataset.h:375
bool has_binary_or_categorical_variables() const
Returns true if any variable is Binary or Categorical.
Index get_samples_number(const string &) const
Returns the number of samples with the given role ("Training", "Validation", ...).
void set_default_variable_names()
Assigns default placeholder names to all variables.
vector< Index > get_used_feature_indices() const
Returns the data column indices that belong to in-use variables.
void set_variable_role(const string &, const string &)
Sets the role of the variable with the given name.
void set_sample_role(const Index, const string &)
Sets the role of a single sample by index.
vector< vector< string > > data_file_preview
Definition dataset.h:450
vector< string > get_feature_names(const string &) const
Returns the expanded feature names restricted to the given role.
vector< vector< Index > > split_samples(const vector< Index > &, Index) const
Splits the given indices into chunks of the requested size.
bool has_categorical_variables() const
Returns true if any variable has type Categorical.
MatrixR get_variable_data(const string &) const
Returns the data columns of the variable with the given name.
bool has_validation() const
Returns true if any sample is assigned the Validation role.
const vector< string > positive_words
Definition dataset.h:454
const Separator & get_separator() const
Returns the current field Separator.
Definition dataset.h:170
Index get_features_number(const string &) const
Returns the number of features for variables with the given role name.
Shape input_shape
Definition dataset.h:434
void set_separator_string(const string &)
Sets the separator from its literal character.
virtual void fill_inputs(const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int contiguous=-1) const
Copies input features of the given samples into a destination buffer.
const filesystem::path & get_data_path() const
Returns the configured data file path.
Definition dataset.h:167
void set_separator_name(const string &)
Sets the separator from its enumerator name.
Shape get_shape(const string &) const
Returns the configured Shape for the given role ("Input", "Target", "Decoder").
void set_variables(const vector< Variable > &new_variables)
Definition dataset.h:211
void set_data_constant(const float)
Fills the data matrix with the given constant value.
virtual ~Dataset()=default
vector< Index > get_used_sample_indices() const
Returns indices of all samples whose role is not None.
void read_data_file_preview(const vector< vector< string_view > > &)
virtual void resize_input_shape(Index input_features_count)
Resizes the input shape to the given flat feature count.
Definition dataset.h:255
vector< Index > get_sample_indices(const string &) const
Returns indices of samples with the given role name.
Index get_used_variables_number() const
Returns the number of variables whose role is in use.
bool has_time_variable() const
Returns true if any variable has role Time.
void samples_to_JSON(JsonWriter &) const
bool has_nan_row(const Index) const
Returns true if the sample at the given row contains a NaN.
bool has_nan() const
Returns true if any entry in the data matrix is NaN.
VectorI count_nans_per_variable() const
Returns the NaN count for each variable.
const vector< Variable > & get_variables() const
Returns the variable descriptors.
Definition dataset.h:105
bool get_display() const
Returns whether progress messages are printed.
Definition dataset.h:182
filesystem::path data_path
Definition dataset.h:445
Separator
Field separator used when reading delimited text files.
Definition dataset.h:71
@ Comma
Definition dataset.h:71
@ Tab
Definition dataset.h:71
@ Semicolon
Definition dataset.h:71
@ Space
Definition dataset.h:71
const Codification & get_codification() const
Returns the configured text Codification.
Definition dataset.h:177
string get_separator_string() const
Returns the separator as the literal character used in files.
vector< string > get_variable_names(const string &) const
Returns the names of variables with the given role name.
void set_sample_roles(const string &)
Assigns the same role to every sample.
void split_samples(const float training_ratio=0.6f, float selection_ratio=0.2f, float testing_ratio=0.2f, bool shuffle=true)
Splits samples into Training/Validation/Testing roles, optionally shuffled.
VariableType get_variable_type(const Index index) const
Returns the VariableType of the variable at the given index.
Definition dataset.h:125
virtual void to_JSON(JsonWriter &) const
Writes dataset state to a JSON writer.
Definition dataset.h:322
bool is_sample_used(const Index i) const
Returns true if the sample at i has a role other than None.
Definition dataset.h:274
void preview_data_from_JSON(const Json *)
void check_separators(string_view) const
virtual Tensor< Correlation, 2 > calculate_input_target_variable_pearson_correlations() const
Returns the Pearson correlations between input and target variables.
Definition dataset.h:368
Index get_used_features_number() const
Returns the number of features for in-use variables.
bool display
Definition dataset.h:452
vector< Variable > get_variables(const string &) const
Returns the variables with the given role name.
void set_has_header(bool new_has_header)
Definition dataset.h:258
void save(const filesystem::path &) const
Saves the dataset metadata to a JSON file at the given path.
void variables_to_JSON(JsonWriter &) const
virtual void scrub_missing_values()
Removes or imputes missing values using the configured strategy.
Definition dataset.h:363
Index get_features_number() const
Returns the total number of features (data matrix columns).
Definition adaptive_moment_estimation.h:14
const string & sample_role_to_string(SampleRole role)
Returns the canonical string name for a SampleRole.
Definition dataset.h:44
const EnumMap< SampleRole > & sample_role_map()
Returns the bidirectional string/enum map for SampleRole.
Definition dataset.h:31
SampleRole string_to_sample_role(const string &name)
Parses a string (name or "0"/"1"/"2"/"3") into the matching SampleRole.
Definition dataset.h:50
SampleRole
Role of a sample in a dataset split.
Definition dataset.h:23
@ Validation
Definition dataset.h:25
@ None
Definition dataset.h:27
@ Training
Definition dataset.h:24
@ Testing
Definition dataset.h:26
VariableType
Data type of a dataset Variable.
Definition variable.h:19
void shuffle(VectorB &vector_to_shuffle)
Randomly permutes the entries of a boolean vector in place.
Matrix< float, Dynamic, 1 > VectorR
Definition pch.h:181
Matrix< float, Dynamic, Dynamic, Layout > MatrixR
Definition pch.h:177
Matrix< Index, Dynamic, 1 > VectorI
Definition pch.h:182
const string & to_string(Enum value) const
Definition enum_map.h:23
Fixed-capacity small-vector describing tensor dimensions (rank up to MaxRank).
Definition tensor_utilities.h:42