DataSet Class Reference

This class represents the concept of data set for data modelling problems, such as approximation, classification or forecasting. More...

#include <data_set.h>

Classes

struct  Column
 This structure represents the columns of the DataSet. More...
 

Public Types

enum class  Separator { Space , Tab , Comma , Semicolon }
 Enumeration of available separators for the data file. More...
 
enum class  MissingValuesMethod { Unuse , Mean , Median }
 Enumeration of available methods for missing values in the data. More...
 
enum class  ProjectType { Approximation , Classification , Forecasting , ImageClassification }
 Enumeration of the learning tasks. More...
 
enum class  SampleUse { Training , Selection , Testing , UnusedSample }
 
enum class  VariableUse {
  Id , Input , Target , Time ,
  UnusedVariable
}
 
enum class  ColumnType {
  Numeric , Binary , Categorical , DateTime ,
  Constant
}
 

Public Member Functions

 DataSet ()
 
 DataSet (const Tensor< type, 2 > &)
 
 DataSet (const Index &, const Index &)
 
 DataSet (const Index &, const Index &, const Index &)
 
 DataSet (const string &, const char &, const bool &)
 
virtual ~DataSet ()
 Destructor. More...
 
Index get_samples_number () const
 
Index get_training_samples_number () const
 Returns the number of samples in the data set which will be used for training. More...
 
Index get_selection_samples_number () const
 Returns the number of samples in the data set which will be used for selection. More...
 
Index get_testing_samples_number () const
 Returns the number of samples in the data set which will be used for testing. More...
 
Index get_used_samples_number () const
 
Index get_unused_samples_number () const
 
Tensor< Index, 1 > get_training_samples_indices () const
 Returns the indices of the samples which will be used for training. More...
 
Tensor< Index, 1 > get_selection_samples_indices () const
 Returns the indices of the samples which will be used for selection. More...
 
Tensor< Index, 1 > get_testing_samples_indices () const
 Returns the indices of the samples which will be used for testing. More...
 
Tensor< Index, 1 > get_used_samples_indices () const
 Returns the indices of the used samples (those which are not set unused). More...
 
Tensor< Index, 1 > get_unused_samples_indices () const
 Returns the indices of the samples set unused. More...
 
SampleUse get_sample_use (const Index &) const
 
const Tensor< SampleUse, 1 > & get_samples_uses () const
 Returns the use of every sample (training, selection, testing or unused) in a vector. More...
 
Tensor< Index, 1 > get_samples_uses_numbers () const
 
Tensor< type, 1 > get_samples_uses_percentages () const
 
string get_sample_string (const Index &, const string &=",") const
 
Tensor< Column, 1 > get_columns () const
 Returns the columns of the data set. More...
 
Tensor< Column, 1 > get_time_series_columns () const
 
Index get_time_series_data_rows_number () const
 
Tensor< Column, 1 > get_input_columns () const
 Returns the input columns of the data set. More...
 
Tensor< bool, 1 > get_input_columns_binary () const
 Returns a boolean vector indicating which columns of the data set are inputs. More...
 
Tensor< Column, 1 > get_target_columns () const
 Returns the target columns of the data set. More...
 
Tensor< Column, 1 > get_used_columns () const
 Returns the used columns of the data set. More...
 
Index get_columns_number () const
 Returns the number of columns in the data set. More...
 
Index get_input_columns_number () const
 Returns the number of columns whose uses are Input. More...
 
Index get_input_time_series_columns_number () const
 
Index get_target_columns_number () const
 Returns the number of columns whose uses are Target. More...
 
Index get_target_time_series_columns_number () const
 
Index get_time_columns_number () const
 Returns the number of columns whose uses are Time. More...
 
Index get_unused_columns_number () const
 Returns the number of columns that are not used. More...
 
Index get_used_columns_number () const
 Returns the number of columns that are used. More...
 
Index get_column_index (const string &) const
 
Index get_column_index (const Index &) const
 
Tensor< Index, 1 > get_input_columns_indices () const
 Returns a vector of indices with the positions of the inputs. More...
 
Tensor< Index, 1 > get_input_time_series_columns_indices () const
 
Tensor< Index, 1 > get_target_columns_indices () const
 Returns a vector of indices with the positions of the targets. More...
 
Tensor< Index, 1 > get_target_time_series_columns_indices () const
 
Tensor< Index, 1 > get_unused_columns_indices () const
 Returns a vector of indices with the positions of the unused columns. More...
 
Tensor< Index, 1 > get_used_columns_indices () const
 Returns a vector of indices with the positions of the used columns. More...
 
Tensor< string, 1 > get_columns_names () const
 Returns a string vector that contains the names of the columns. More...
 
Tensor< string, 1 > get_input_columns_names () const
 Returns a string vector that contains the names of the columns whose uses are Input. More...
 
Tensor< string, 1 > get_target_columns_names () const
 Returns a string vector which contains the names of the columns whose uses are Target. More...
 
Tensor< string, 1 > get_used_columns_names () const
 Returns a string vector which contains the names of the columns used whether Input, Target or Time. More...
 
ColumnType get_column_type (const Index &index) const
 
VariableUse get_column_use (const Index &) const
 Returns the use of the column with the given index, without taking into account the categories. More...
 
Tensor< VariableUse, 1 > get_columns_uses () const
 Returns the use of each column of the data set. More...
 
Index get_variables_number () const
 Returns the number of variables in the data set. More...
 
Index get_time_series_variables_number () const
 Returns the number of variables in the time series data. More...
 
Index get_input_variables_number () const
 
Index get_target_variables_number () const
 Returns the number of target variables of the data set. More...
 
Index get_unused_variables_number () const
 Returns the number of variables which will neither be used as input nor as target. More...
 
Index get_used_variables_number () const
 Returns the number of variables which are either input or target. More...
 
string get_variable_name (const Index &) const
 
Tensor< string, 1 > get_variables_names () const
 
Tensor< string, 1 > get_time_series_variables_names () const
 
Tensor< string, 1 > get_input_variables_names () const
 
Tensor< string, 1 > get_target_variables_names () const
 
Index get_variable_index (const string &name) const
 
Tensor< Index, 1 > get_variable_indices (const Index &) const
 
Tensor< Index, 1 > get_unused_variables_indices () const
 Returns the indices of the unused variables. More...
 
Tensor< Index, 1 > get_used_variables_indices () const
 Returns the indices of the used variables. More...
 
Tensor< Index, 1 > get_input_variables_indices () const
 Returns the indices of the input variables. More...
 
Tensor< Index, 1 > get_target_variables_indices () const
 Returns the indices of the target variables. More...
 
VariableUse get_variable_use (const Index &) const
 
Tensor< VariableUse, 1 > get_variables_uses () const
 
const Tensor< Index, 1 > & get_input_variables_dimensions () const
 Returns the dimensions of the input variables. More...
 
Index get_input_variables_rank () const
 
Tensor< Scaler, 1 > get_columns_scalers () const
 
Tensor< Scaler, 1 > get_input_variables_scalers () const
 
Tensor< Scaler, 1 > get_target_variables_scalers () const
 
Tensor< Index, 2 > get_batches (const Tensor< Index, 1 > &, const Index &, const bool &, const Index &buffer_size=100) const
 
const Tensor< type, 2 > & get_data () const
 
Tensor< type, 2 > * get_data_pointer ()
 
const Tensor< type, 2 > & get_time_series_data () const
 
Tensor< type, 2 > get_training_data () const
 
Tensor< type, 2 > get_selection_data () const
 
Tensor< type, 2 > get_testing_data () const
 
Tensor< string, 1 > get_time_series_columns_names () const
 
Index get_time_series_columns_number () const
 Returns the number of columns in the time series. More...
 
Tensor< type, 2 > get_input_data () const
 
Tensor< type, 2 > get_target_data () const
 
Tensor< type, 2 > get_input_data (const Tensor< Index, 1 > &) const
 
Tensor< type, 2 > get_target_data (const Tensor< Index, 1 > &) const
 
Tensor< type, 2 > get_training_input_data () const
 
Tensor< type, 2 > get_training_target_data () const
 
Tensor< type, 2 > get_selection_input_data () const
 
Tensor< type, 2 > get_selection_target_data () const
 
Tensor< type, 2 > get_testing_input_data () const
 
Tensor< type, 2 > get_testing_target_data () const
 
Tensor< type, 1 > get_sample_data (const Index &) const
 
Tensor< type, 1 > get_sample_data (const Index &, const Tensor< Index, 1 > &) const
 
Tensor< type, 2 > get_sample_input_data (const Index &) const
 
Tensor< type, 2 > get_sample_target_data (const Index &) const
 
Tensor< type, 2 > get_column_data (const Index &) const
 
Tensor< type, 2 > get_column_data (const Index &, const Tensor< Index, 1 > &) const
 
Tensor< type, 2 > get_column_data (const Tensor< Index, 1 > &) const
 
Tensor< type, 2 > get_column_data (const string &) const
 
Tensor< type, 1 > get_variable_data (const Index &) const
 
Tensor< type, 1 > get_variable_data (const string &) const
 
Tensor< type, 1 > get_variable_data (const Index &, const Tensor< Index, 1 > &) const
 
Tensor< type, 1 > get_variable_data (const string &, const Tensor< Index, 1 > &) const
 
Tensor< Tensor< string, 1 >, 1 > get_data_file_preview () const
 
Tensor< type, 2 > get_subtensor_data (const Tensor< Index, 1 > &, const Tensor< Index, 1 > &) const
 
MissingValuesMethod get_missing_values_method () const
 Returns the method used for handling missing values. More...
 
const string & get_data_file_name () const
 Returns the name of the data file. More...
 
const bool & get_header_line () const
 Returns true if the first line of the data file has a header with the names of the variables, and false otherwise. More...
 
const bool & get_rows_label () const
 Returns true if the data file has rows label, and false otherwise. More...
 
Tensor< string, 1 > get_rows_label_tensor () const
 
Tensor< string, 1 > get_selection_rows_label_tensor ()
 
Tensor< string, 1 > get_testing_rows_label_tensor ()
 
const Separator & get_separator () const
 Returns the separator to be used in the data file. More...
 
char get_separator_char () const
 Returns the character which will be used as separator in the data file. More...
 
string get_separator_string () const
 Returns the string which will be used as separator in the data file. More...
 
const string & get_missing_values_label () const
 Returns the string which will be used as label for the missing values in the data file. More...
 
const Index & get_lags_number () const
 Returns the number of lags to be used in a time series prediction application. More...
 
const Index & get_steps_ahead () const
 Returns the number of steps ahead to be used in a time series prediction application. More...
 
const string & get_time_column () const
 Returns the name of the time column in the data set. More...
 
Index get_time_series_time_column_index () const
 
Index get_gmt () const
 
const bool & get_display () const
 
void set ()
 Sets zero samples and zero variables in the data set. More...
 
void set (const Tensor< type, 2 > &)
 
void set (const Index &, const Index &)
 
void set (const Index &, const Index &, const Index &)
 
void set (const DataSet &)
 
void set (const tinyxml2::XMLDocument &)
 
void set (const string &)
 
void set (const string &, const char &, const bool &)
 
void set_default ()
 
void set_threads_number (const int &)
 
void set_samples_number (const Index &)
 
void set_training ()
 Sets all the samples in the data set for training. More...
 
void set_selection ()
 Sets all the samples in the data set for selection. More...
 
void set_testing ()
 Sets all the samples in the data set for testing. More...
 
void set_training (const Tensor< Index, 1 > &)
 
void set_selection (const Tensor< Index, 1 > &)
 
void set_testing (const Tensor< Index, 1 > &)
 
void set_samples_unused ()
 Sets all the samples in the data set for unused. More...
 
void set_samples_unused (const Tensor< Index, 1 > &)
 
void set_sample_use (const Index &, const SampleUse &)
 
void set_sample_use (const Index &, const string &)
 
void set_samples_uses (const Tensor< SampleUse, 1 > &)
 
void set_samples_uses (const Tensor< string, 1 > &)
 
void set_columns (const Tensor< Column, 1 > &)
 
void set_default_columns_uses ()
 
void set_default_columns_names ()
 
void set_column_name (const Index &, const string &)
 
void set_columns_uses (const Tensor< string, 1 > &)
 
void set_columns_uses (const Tensor< VariableUse, 1 > &)
 
void set_columns_unused ()
 Sets all columns in the data_set as unused columns. More...
 
void set_input_target_columns (const Tensor< Index, 1 > &, const Tensor< Index, 1 > &)
 
void set_input_columns_unused ()
 Sets all input columns in the data_set as unused columns. More...
 
void set_input_columns (const Tensor< Index, 1 > &, const Tensor< bool, 1 > &)
 
void set_column_use (const Index &, const VariableUse &)
 
void set_column_use (const string &, const VariableUse &)
 
void set_column_type (const Index &, const ColumnType &)
 
void set_column_type (const string &, const ColumnType &)
 
void set_columns_names (const Tensor< string, 1 > &)
 
void set_columns_number (const Index &)
 
void set_columns_scalers (const Scaler &)
 
void set_binary_simple_columns ()
 
void check_constant_columns ()
 
Tensor< type, 2 > transform_binary_column (const Tensor< type, 1 > &) const
 
void set_variables_names (const Tensor< string, 1 > &)
 
void set_variable_name (const Index &, const string &)
 
void set_input ()
 Sets all the variables in the data set as input variables. More...
 
void set_target ()
 Sets all the variables in the data set as target variables. More...
 
void set_variables_unused ()
 Sets all the variables in the data set as unused variables. More...
 
void set_input_variables_dimensions (const Tensor< Index, 1 > &)
 Sets new input dimensions in the data set. More...
 
void set_data (const Tensor< type, 2 > &)
 
void set_data_file_name (const string &)
 
void set_has_columns_names (const bool &)
 Sets if the data file contains a header with the names of the columns. More...
 
void set_has_rows_label (const bool &)
 Sets if the data file contains rows label. More...
 
void set_separator (const Separator &)
 
void set_separator (const string &)
 
void set_separator (const char &)
 
void set_missing_values_label (const string &)
 
void set_missing_values_method (const MissingValuesMethod &)
 
void set_missing_values_method (const string &)
 
void set_lags_number (const Index &)
 
void set_steps_ahead_number (const Index &)
 
void set_time_column (const string &)
 
void set_gmt (Index &)
 
void set_display (const bool &)
 
bool is_empty () const
 Returns true if the data matrix is empty, and false otherwise. More...
 
bool is_sample_used (const Index &) const
 
bool is_sample_unused (const Index &) const
 
bool has_binary_columns () const
 
bool has_categorical_columns () const
 
bool has_time_columns () const
 
bool has_time_time_series_columns () const
 
bool has_selection () const
 
void split_samples_sequential (const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2))
 
void split_samples_random (const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2))
 
Tensor< string, 1 > unuse_constant_columns ()
 
Tensor< Index, 1 > unuse_repeated_samples ()
 
Tensor< string, 1 > unuse_uncorrelated_columns (const type &=type(0.25))
 
void set_data_constant (const type &)
 
void set_data_random ()
 
void set_data_binary_random ()
 
Tensor< Descriptives, 1 > calculate_variables_descriptives () const
 
Tensor< Descriptives, 1 > calculate_used_variables_descriptives () const
 
Tensor< Descriptives, 1 > calculate_columns_descriptives_positive_samples () const
 Calculate the descriptives of the samples with positive targets in binary classification problems. More...
 
Tensor< Descriptives, 1 > calculate_columns_descriptives_negative_samples () const
 Calculate the descriptives of the samples with negative targets in binary classification problems. More...
 
Tensor< Descriptives, 1 > calculate_columns_descriptives_categories (const Index &) const
 
Tensor< Descriptives, 1 > calculate_columns_descriptives_training_samples () const
 
Tensor< Descriptives, 1 > calculate_columns_descriptives_selection_samples () const
 
Tensor< Descriptives, 1 > calculate_input_variables_descriptives () const
 
Tensor< Descriptives, 1 > calculate_target_variables_descriptives () const
 
Tensor< Descriptives, 1 > calculate_testing_target_variables_descriptives () const
 
Tensor< type, 1 > calculate_input_variables_minimums () const
 Returns a vector containing the minimums of the input variables. More...
 
Tensor< type, 1 > calculate_target_variables_minimums () const
 Returns a vector containing the minimums of the target variables. More...
 
Tensor< type, 1 > calculate_input_variables_maximums () const
 Returns a vector containing the maximums of the input variables. More...
 
Tensor< type, 1 > calculate_target_variables_maximums () const
 Returns a vector containing the maximums of the target variables. More...
 
Tensor< type, 1 > calculate_variables_means (const Tensor< Index, 1 > &) const
 
Tensor< type, 1 > calculate_used_variables_minimums () const
 Returns a vector containing the minimums of the used variables. More...
 
Tensor< type, 1 > calculate_used_targets_mean () const
 
Tensor< type, 1 > calculate_selection_targets_mean () const
 Returns the mean values of the target variables on the selection. More...
 
Index calculate_used_negatives (const Index &) const
 
Index calculate_training_negatives (const Index &) const
 
Index calculate_selection_negatives (const Index &) const
 
Index calculate_testing_negatives (const Index &) const
 
Tensor< Histogram, 1 > calculate_columns_distribution (const Index &=10) const
 
Tensor< BoxPlot, 1 > calculate_columns_box_plots () const
 
Tensor< Correlation, 2 > calculate_input_columns_correlations () const
 
void print_inputs_correlations () const
 Print on screen the correlation between variables in the data set. More...
 
void print_top_inputs_correlations () const
 
Tensor< Correlation, 2 > calculate_input_target_columns_correlations () const
 
void print_input_target_columns_correlations () const
 Print on screen the correlation between targets and inputs. More...
 
void print_top_input_target_columns_correlations () const
 
Tensor< Index, 1 > filter_data (const Tensor< type, 1 > &, const Tensor< type, 1 > &)
 
void set_default_columns_scalers ()
 
Tensor< Descriptives, 1 > scale_data ()
 
Tensor< Descriptives, 1 > scale_input_variables ()
 
Tensor< Descriptives, 1 > scale_target_variables ()
 
void unscale_data (const Tensor< Descriptives, 1 > &)
 
void unscale_input_variables (const Tensor< Descriptives, 1 > &)
 
void unscale_target_variables (const Tensor< Descriptives, 1 > &)
 
Tensor< Index, 1 > calculate_target_distribution () const
 
Tensor< Tensor< Index, 1 >, 1 > calculate_Tukey_outliers (const type &=type(1.5)) const
 
void unuse_Tukey_outliers (const type &=type(1.5))
 
Tensor< Index, 1 > calculate_local_outlier_factor_outliers (const Index &=20, const Index &=0, const type &=type(0)) const
 
void unuse_local_outlier_factor_outliers (const Index &=20, const type &=type(1.5))
 
Tensor< Index, 1 > calculate_isolation_forest_outliers (const Index &=100, const Index &=256, const type &=type(0)) const
 
void unuse_isolation_forest_outliers (const Index &=20, const type &=type(1.5))
 
void transform_time_series ()
 Arranges an input-target DataSet from a time series matrix, according to the number of lags. More...
 
void transform_time_series_columns ()
 This method transforms the columns into time series for forecasting problems. More...
 
void transform_time_series_data ()
 
void get_time_series_columns_number (const Index &)
 
void set_time_series_data (const Tensor< type, 2 > &)
 
void set_time_series_columns_number (const Index &)
 
Tensor< type, 2 > get_time_series_column_data (const Index &) const
 
Tensor< type, 2 > calculate_autocorrelations (const Index &=10) const
 
Tensor< type, 3 > calculate_cross_correlations (const Index &=10) const
 Calculates the cross-correlation between all the variables in the data set. More...
 
void generate_constant_data (const Index &, const Index &, const type &)
 
void generate_random_data (const Index &, const Index &)
 
void generate_sequential_data (const Index &, const Index &)
 
void generate_Rosenbrock_data (const Index &, const Index &)
 
void generate_sum_data (const Index &, const Index &)
 
void print () const
 Prints to the screen in text format the main numbers from the data set object. More...
 
void from_XML (const tinyxml2::XMLDocument &)
 
void write_XML (tinyxml2::XMLPrinter &) const
 Serializes the data set object into an XML document of the TinyXML library without keeping the DOM tree in memory. More...
 
void save (const string &) const
 
void load (const string &)
 
void print_columns () const
 
void print_columns_types () const
 
void print_columns_uses () const
 
void print_data () const
 Prints to the screen the values of the data matrix. More...
 
void print_data_preview () const
 
void print_data_file_preview () const
 
void save_data () const
 Saves to the data file the values of the data matrix. More...
 
void save_data_binary (const string &) const
 Saves to the data file the values of the data matrix in binary format. More...
 
void save_time_series_data_binary (const string &) const
 Saves to the data file the values of the time series data matrix in binary format. More...
 
void read_csv ()
 
void load_data_binary ()
 This method loads the data from a binary data file. More...
 
void load_time_series_data_binary (const string &)
 This method loads time series data from a binary data. More...
 
void check_input_csv (const string &, const char &) const
 This method checks if the input data file has the correct format. Returns an error message. More...
 
Tensor< type, 2 > read_input_csv (const string &, const char &, const string &, const bool &, const bool &) const
 This method loads data from a file and returns a matrix containing the input columns. More...
 
void fill_time_series (const Index &)
 
bool has_nan () const
 Returns true if the data contain missing values. More...
 
bool has_nan_row (const Index &) const
 Returns true if the given row contains missing values. More...
 
void print_missing_values_information () const
 
void impute_missing_values_unuse ()
 Sets all the samples with missing values to "Unused". More...
 
void impute_missing_values_mean ()
 Substitutes all the missing values by the mean of the corresponding variable. More...
 
void impute_missing_values_median ()
 Substitutes all the missing values by the median of the corresponding variable. More...
 
void scrub_missing_values ()
 
Tensor< Index, 1 > count_nan_columns () const
 
Index count_rows_with_nan () const
 
Index count_nan () const
 
void set_missing_values_number (const Index &)
 
void set_missing_values_number ()
 
void set_columns_missing_values_number (const Tensor< Index, 1 > &)
 
void set_columns_missing_values_number ()
 
void set_rows_missing_values_number (const Index &)
 
void set_rows_missing_values_number ()
 
void fix_repeated_names ()
 
Tensor< Index, 1 > push_back (const Tensor< Index, 1 > &, const Index &) const
 
Tensor< string, 1 > push_back (const Tensor< string, 1 > &, const string &) const
 
void initialize_sequential (Tensor< Index, 1 > &, const Index &, const Index &, const Index &) const
 
void intialize_sequential (Tensor< type, 1 > &, const type &, const type &, const type &) const
 
Tensor< Index, 2 > split_samples (const Tensor< Index, 1 > &, const Index &) const
 
bool get_has_rows_labels () const
 
void shuffle ()
 
void read_csv_1 ()
 
void read_csv_2_simple ()
 
void read_csv_3_simple ()
 
void read_csv_2_complete ()
 
void read_csv_3_complete ()
 
void check_separators (const string &) const
 
void check_special_characters (const string &) const
 

Static Public Member Functions

static Tensor< string, 1 > get_default_columns_names (const Index &)
 
static Scaler get_scaling_unscaling_method (const string &)
 

Private Member Functions

Tensor< Index, 1 > select_outliers_via_standard_deviation (const Tensor< type, 1 > &, const type &=type(2.0), bool=true) const
 
Tensor< Index, 1 > select_outliers_via_contamination (const Tensor< type, 1 > &, const type &=type(0.05), bool=true) const
 
type calculate_euclidean_distance (const Tensor< Index, 1 > &, const Index &, const Index &) const
 
Tensor< type, 2 > calculate_distance_matrix (const Tensor< Index, 1 > &) const
 
Tensor< list< Index >, 1 > calculate_k_nearest_neighbors (const Tensor< type, 2 > &, const Index &=20) const
 
Tensor< Tensor< type, 1 >, 1 > get_kd_tree_data () const
 
Tensor< Tensor< Index, 1 >, 1 > create_bounding_limits_kd_tree (const Index &) const
 
void create_kd_tree (Tensor< Tensor< type, 1 >, 1 > &, const Tensor< Tensor< Index, 1 >, 1 > &) const
 
Tensor< list< Index >, 1 > calculate_bounding_boxes_neighbors (const Tensor< Tensor< type, 1 >, 1 > &, const Tensor< Index, 1 > &, const Index &, const Index &) const
 
Tensor< list< Index >, 1 > calculate_kd_tree_neighbors (const Index &=20, const Index &=40) const
 
Tensor< type, 1 > calculate_average_reachability (Tensor< list< Index >, 1 > &, const Index &) const
 
Tensor< type, 1 > calculate_local_outlier_factor (Tensor< list< Index >, 1 > &, const Tensor< type, 1 > &, const Index &) const
 
void calculate_min_max_indices_list (list< Index > &, const Index &, type &, type &) const
 
Index split_isolation_tree (Tensor< type, 2 > &, list< list< Index > > &, list< Index > &) const
 
Tensor< type, 2 > create_isolation_tree (const Tensor< Index, 1 > &, const Index &) const
 
Tensor< Tensor< type, 2 >, 1 > create_isolation_forest (const Index &, const Index &, const Index &) const
 
type calculate_tree_path (const Tensor< type, 2 > &, const Index &, const Index &) const
 
Tensor< type, 1 > calculate_average_forest_paths (const Tensor< Tensor< type, 2 >, 1 > &, const Index &) const
 

Private Attributes

NonBlockingThreadPool * non_blocking_thread_pool = nullptr
 
ThreadPoolDevice * thread_pool_device = nullptr
 
Tensor< type, 2 > data
 
Tensor< SampleUse, 1 > samples_uses
 
Tensor< string, 1 > rows_labels
 
Tensor< Column, 1 > columns
 
Tensor< Index, 1 > input_variables_dimensions
 
string data_file_name
 Data file name. More...
 
Separator separator = Separator::Comma
 Separator character. More...
 
string missing_values_label = "NA"
 Missing values label. More...
 
bool has_columns_names = false
 Header which contains variables name. More...
 
bool has_rows_labels = false
 Header which contains the rows label. More...
 
Tensor< Tensor< string, 1 >, 1 > data_file_preview
 
string time_column
 Index where time variable is located for forecasting applications. More...
 
Index lags_number = 0
 Number of lags. More...
 
Index steps_ahead = 0
 Number of steps ahead. More...
 
Tensor< type, 2 > time_series_data
 
Tensor< Column, 1 > time_series_columns
 
Index gmt = 0
 
MissingValuesMethod missing_values_method = MissingValuesMethod::Unuse
 Missing values method. More...
 
Index missing_values_number
 Missing values. More...
 
Tensor< Index, 1 > columns_missing_values_number
 
Index rows_missing_values_number
 
bool display = true
 Display messages to screen. More...
 
const Eigen::array< IndexPair< Index >, 1 > product_vector_vector = {IndexPair<Index>(0, 0)}
 

Detailed Description

This class represents the concept of data set for data modelling problems, such as approximation, classification or forecasting.

It basically consists of a data Matrix separated by columns. These columns can take different categories depending on the data hosted in them.

With the OpenNN DataSet class you can edit the data to prepare your model, such as eliminating missing values, calculating correlations between variables (inputs and targets), not using certain variables or samples, etc.

Definition at line 56 of file data_set.h.

Member Enumeration Documentation

◆ ColumnType

enum class ColumnType
strong

This enumeration represents the data type of a column (numeric, binary, categorical, date-time or constant).

Definition at line 104 of file data_set.h.

◆ MissingValuesMethod

enum class MissingValuesMethod
strong

Enumeration of available methods for missing values in the data.

Definition at line 85 of file data_set.h.

◆ ProjectType

enum class ProjectType
strong

Enumeration of the learning tasks.

Definition at line 89 of file data_set.h.

◆ SampleUse

enum class SampleUse
strong

This enumeration represents the possible uses of a sample (training, selection, testing or unused).

Definition at line 94 of file data_set.h.

◆ Separator

enum class Separator
strong

Enumeration of available separators for the data file.

Definition at line 81 of file data_set.h.

◆ VariableUse

enum class VariableUse
strong

This enumeration represents the possible uses of a variable (id, input, target, time or unused).

Definition at line 99 of file data_set.h.

Constructor & Destructor Documentation

◆ DataSet() [1/5]

DataSet ( )
explicit

Default constructor. It creates a data set object with zero samples and zero inputs and target variables. It also initializes the rest of class members to their default values.

Definition at line 20 of file data_set.cpp.

◆ DataSet() [2/5]

DataSet ( const Tensor< type, 2 > &  data)
explicit

Data constructor. It creates a data set object from a data Eigen Matrix. It also initializes the rest of class members to their default values.

Parameters
data: Data Tensor<type, 2>.

Definition at line 32 of file data_set.cpp.

◆ DataSet() [3/5]

DataSet ( const Index &  new_samples_number,
const Index &  new_variables_number 
)
explicit

Samples and variables number constructor. It creates a data set object with given samples and variables numbers. All the variables are set as inputs. It also initializes the rest of class members to their default values.

Parameters
new_samples_number: Number of samples in the data set.
new_variables_number: Number of variables.

Definition at line 47 of file data_set.cpp.

◆ DataSet() [4/5]

DataSet ( const Index &  new_samples_number,
const Index &  new_inputs_number,
const Index &  new_targets_number 
)
explicit

Samples number, input variables number and target variables number constructor. It creates a data set object with given samples and inputs and target variables numbers. It also initializes the rest of class members to their default values.

Parameters
new_samples_number: Number of samples in the data set.
new_inputs_number: Number of input variables.
new_targets_number: Number of target variables.

Definition at line 62 of file data_set.cpp.

◆ DataSet() [5/5]

DataSet ( const string &  data_file_name,
const char &  separator,
const bool &  has_columns_names 
)
explicit

File and separator constructor. It creates a data set object by loading the object members from a data file. It also sets a separator. Please pay attention to the file format, which is specified in the User's Guide.

Parameters
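data_file_name: Data file name.
separator: Separator character used in the data file.
has_columns_names: True if the data file contains a header with the names of the columns.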

Definition at line 76 of file data_set.cpp.

◆ ~DataSet()

~DataSet ( )
virtual

Destructor.

Definition at line 84 of file data_set.cpp.

Member Function Documentation

◆ calculate_autocorrelations()

Tensor< type, 2 > calculate_autocorrelations ( const Index &  lags_number = 10) const

Returns a matrix with the values of autocorrelation for every variable in the data set. The number of rows is equal to the number of variables. The number of columns is the maximum lags number.

Parameters
lags_number: Maximum lags number for which autocorrelation is calculated.

Definition at line 9315 of file data_set.cpp.

◆ calculate_average_forest_paths()

Tensor< type, 1 > calculate_average_forest_paths ( const Tensor< Tensor< type, 2 >, 1 > &  forest,
const Index &  tree_depth 
) const
private

Definition at line 9269 of file data_set.cpp.

◆ calculate_average_reachability()

Tensor< type, 1 > calculate_average_reachability ( Tensor< list< Index >, 1 > &  k_nearest_indexes,
const Index &  k 
) const
private

Definition at line 8935 of file data_set.cpp.

◆ calculate_bounding_boxes_neighbors()

Tensor< list< Index >, 1 > calculate_bounding_boxes_neighbors ( const Tensor< Tensor< type, 1 >, 1 > &  tree,
const Tensor< Index, 1 > &  leaves_indices,
const Index &  depth,
const Index &  k_neighbors 
) const
private

Definition at line 8874 of file data_set.cpp.

◆ calculate_columns_box_plots()

Tensor< BoxPlot, 1 > calculate_columns_box_plots ( ) const

Returns a vector of subvectors with the values of a box and whiskers plot. The size of the vector is equal to the number of used variables. The size of the subvectors is 5 and they consist of:

  • Minimum
  • First quartile
  • Second quartile
  • Third quartile
  • Maximum

Definition at line 5320 of file data_set.cpp.

◆ calculate_columns_descriptives_categories()

Tensor< Descriptives, 1 > calculate_columns_descriptives_categories ( const Index &  class_index) const

Returns a matrix with the data set descriptive statistics.

Parameters
class_indexData set index number to make the descriptive statistics.

Definition at line 5645 of file data_set.cpp.

◆ calculate_columns_descriptives_negative_samples()

Tensor< Descriptives, 1 > calculate_columns_descriptives_negative_samples ( ) const

Calculate the descriptives of the samples with negative targets in binary classification problems.

Definition at line 5584 of file data_set.cpp.

◆ calculate_columns_descriptives_positive_samples()

Tensor< Descriptives, 1 > calculate_columns_descriptives_positive_samples ( ) const

Calculate the descriptives of the samples with positive targets in binary classification problems.

Definition at line 5525 of file data_set.cpp.

◆ calculate_columns_descriptives_selection_samples()

Tensor< Descriptives, 1 > calculate_columns_descriptives_selection_samples ( ) const

Returns a vector of vectors containing some basic descriptives of all variables on the selection samples. The size of this vector is four. The subvectors are:

  • Selection data minimum.
  • Selection data maximum.
  • Selection data mean.
  • Selection data standard deviation.

Definition at line 5712 of file data_set.cpp.

◆ calculate_columns_descriptives_training_samples()

Tensor< Descriptives, 1 > calculate_columns_descriptives_training_samples ( ) const

Returns a vector of vectors containing some basic descriptives of all variables on the training samples. The size of this vector is four. The subvectors are:

  • Training data minimum.
  • Training data maximum.
  • Training data mean.
  • Training data standard deviation.

Definition at line 5693 of file data_set.cpp.

◆ calculate_columns_distribution()

Tensor< Histogram, 1 > calculate_columns_distribution ( const Index &  bins_number = 10) const

Returns the distribution of each of the columns. In the case of numeric columns, it returns a histogram, for the case of categorical columns, it returns the frequencies of each category and for the binary columns it returns the frequencies of the positives and negatives. The default number of bins is 10.

Parameters
bins_numberNumber of bins.

Definition at line 5201 of file data_set.cpp.

◆ calculate_cross_correlations()

Tensor< type, 3 > calculate_cross_correlations ( const Index &  lags_number = 10) const

Calculates the cross-correlation between all the variables in the data set.

Definition at line 9424 of file data_set.cpp.

◆ calculate_distance_matrix()

Tensor< type, 2 > calculate_distance_matrix ( const Tensor< Index, 1 > &  indices) const
private

Definition at line 8727 of file data_set.cpp.

◆ calculate_euclidean_distance()

type calculate_euclidean_distance ( const Tensor< Index, 1 > &  variables_indices,
const Index &  sample_index,
const Index &  other_sample_index 
) const
private

Definition at line 8707 of file data_set.cpp.

◆ calculate_input_columns_correlations()

Tensor< Correlation, 2 > calculate_input_columns_correlations ( ) const

Calculate the correlation between each input in the data set. Returns a matrix with the correlation values between variables in the data set.

Definition at line 6019 of file data_set.cpp.

◆ calculate_input_target_columns_correlations()

Tensor< Correlation, 2 > calculate_input_target_columns_correlations ( ) const

Calculates the correlations between all outputs and all inputs. It returns a matrix with the data stored in CorrelationsResults format, where the number of rows is the input number and number of columns is the target number. Each element contains the correlation between a single input and a single target.

Definition at line 5876 of file data_set.cpp.

◆ calculate_input_variables_descriptives()

Tensor< Descriptives, 1 > calculate_input_variables_descriptives ( ) const

Returns a vector of Descriptives structures with some basic statistics of the input variables on the used samples. This includes the minimum, maximum, mean and standard deviation. The size of this vector is the number of inputs.

Definition at line 5726 of file data_set.cpp.

◆ calculate_input_variables_maximums()

Tensor< type, 1 > calculate_input_variables_maximums ( ) const

Returns a vector containing the maximums of the input variables.

Definition at line 5784 of file data_set.cpp.

◆ calculate_input_variables_minimums()

Tensor< type, 1 > calculate_input_variables_minimums ( ) const

Returns a vector containing the minimums of the input variables.

Definition at line 5767 of file data_set.cpp.

◆ calculate_isolation_forest_outliers()

Tensor< Index, 1 > calculate_isolation_forest_outliers ( const Index &  n_trees = 100,
const Index &  subs_set_samples = 256,
const type &  contamination = type(0) 
) const

Definition at line 9288 of file data_set.cpp.

◆ calculate_k_nearest_neighbors()

Tensor< list< Index >, 1 > calculate_k_nearest_neighbors ( const Tensor< type, 2 > &  distance_matrix,
const Index &  k_neighbors = 20 
) const
private

Definition at line 8751 of file data_set.cpp.

◆ calculate_kd_tree_neighbors()

Tensor< list< Index >, 1 > calculate_kd_tree_neighbors ( const Index &  k_neighbors = 20,
const Index &  min_samples_leaf = 40 
) const
private

Definition at line 8915 of file data_set.cpp.

◆ calculate_local_outlier_factor()

Tensor< type, 1 > calculate_local_outlier_factor ( Tensor< list< Index >, 1 > &  k_nearest_indexes,
const Tensor< type, 1 > &  average_reachabilities,
const Index &  k 
) const
private

Definition at line 8971 of file data_set.cpp.

◆ calculate_local_outlier_factor_outliers()

Tensor< Index, 1 > calculate_local_outlier_factor_outliers ( const Index &  k_neighbors = 20,
const Index &  min_samples_leaf = 0,
const type &  contamination = type(0) 
) const

Calculate the outliers from the data set using the LocalOutlierFactor method.

Parameters
k_neighborsUsed to perform a k_nearest_algorithm to find the local density. Default is 20.
min_samples_leafThe minimum number of samples per leaf when building a KDTree. If 0, automatically decide between using brute force approach or KDTree. If > samples_number/2, brute force approach is performed. Default is 0.
contaminationPercentage of outliers in the data_set to be selected. If 0.0, those patterns which deviate from the mean of LOF more than 2 times are considered outliers. Default is 0.0.

Definition at line 9002 of file data_set.cpp.

◆ calculate_min_max_indices_list()

void calculate_min_max_indices_list ( list< Index > &  elements,
const Index &  variable_index,
type &  min,
type &  max 
) const
private

Definition at line 9069 of file data_set.cpp.

◆ calculate_selection_negatives()

Index calculate_selection_negatives ( const Index &  target_index) const

Counts the number of negatives of the selected target in the selection data.

Parameters
target_indexIndex of the target to evaluate.

Definition at line 5433 of file data_set.cpp.

◆ calculate_selection_targets_mean()

Tensor< type, 1 > calculate_selection_targets_mean ( ) const

Returns the mean values of the target variables on the selection samples.

Definition at line 5843 of file data_set.cpp.

◆ calculate_target_distribution()

Tensor< Index, 1 > calculate_target_distribution ( ) const

Returns a vector containing the number of samples of each class in the data set. If the number of target variables is one then the number of classes is two. If the number of target variables is greater than one then the number of classes is equal to the number of target variables.

Todo:
Return class_distribution is wrong

Definition at line 8480 of file data_set.cpp.

◆ calculate_target_variables_descriptives()

Tensor< Descriptives, 1 > calculate_target_variables_descriptives ( ) const

Returns a vector of vectors with some basic descriptives of the target variables on all samples. The size of this vector is four. The subvectors are:

  • Target variables minimum.
  • Target variables maximum.
  • Target variables mean.
  • Target variables standard deviation.

Definition at line 5745 of file data_set.cpp.

◆ calculate_target_variables_maximums()

Tensor< type, 1 > calculate_target_variables_maximums ( ) const

Returns a vector containing the maximums of the target variables.

Definition at line 5792 of file data_set.cpp.

◆ calculate_target_variables_minimums()

Tensor< type, 1 > calculate_target_variables_minimums ( ) const

Returns a vector containing the minimums of the target variables.

Definition at line 5775 of file data_set.cpp.

◆ calculate_testing_negatives()

Index calculate_testing_negatives ( const Index &  target_index) const

Counts the number of negatives of the selected target in the testing data.

Parameters
target_indexIndex of the target to evaluate.

Definition at line 5468 of file data_set.cpp.

◆ calculate_testing_target_variables_descriptives()

Tensor< Descriptives, 1 > calculate_testing_target_variables_descriptives ( ) const

Definition at line 5755 of file data_set.cpp.

◆ calculate_training_negatives()

Index calculate_training_negatives ( const Index &  target_index) const

Counts the number of negatives of the selected target in the training data.

Parameters
target_indexIndex of the target to evaluate.

Definition at line 5398 of file data_set.cpp.

◆ calculate_tree_path()

type calculate_tree_path ( const Tensor< type, 2 > &  tree,
const Index &  sample_index,
const Index &  tree_depth 
) const
private

Definition at line 9227 of file data_set.cpp.

◆ calculate_Tukey_outliers()

Tensor< Tensor< Index, 1 >, 1 > calculate_Tukey_outliers ( const type &  cleaning_parameter = type(1.5)) const

Calculate the outliers from the data set using the Tukey's test.

Parameters
cleaning_parameterParameter used to detect outliers.

Definition at line 8540 of file data_set.cpp.

◆ calculate_used_negatives()

Index calculate_used_negatives ( const Index &  target_index) const

Counts the number of used negatives of the selected target.

Parameters
target_indexIndex of the target to evaluate.

Definition at line 5363 of file data_set.cpp.

◆ calculate_used_targets_mean()

Tensor< type, 1 > calculate_used_targets_mean ( ) const

Definition at line 5831 of file data_set.cpp.

◆ calculate_used_variables_descriptives()

Tensor< Descriptives, 1 > calculate_used_variables_descriptives ( ) const

Returns a vector of vectors containing some basic descriptives of the used variables and samples. The size of this vector is four. The subvectors are:

  • Minimum.
  • Maximum.
  • Mean.
  • Standard deviation.

Definition at line 5514 of file data_set.cpp.

◆ calculate_used_variables_minimums()

Tensor< type, 1 > calculate_used_variables_minimums ( ) const

Returns a vector containing the minimums of the used variables.

Definition at line 5800 of file data_set.cpp.

◆ calculate_variables_descriptives()

Tensor< Descriptives, 1 > calculate_variables_descriptives ( ) const

Returns a vector of vectors containing some basic descriptives of all the variables in the data set. The size of this vector is four. The subvectors are:

  • Minimum.
  • Maximum.
  • Mean.
  • Standard deviation.

Definition at line 5499 of file data_set.cpp.

◆ calculate_variables_means()

Tensor< type, 1 > calculate_variables_means ( const Tensor< Index, 1 > &  variables_indices) const

Returns a vector containing the means of a set of given variables.

Parameters
variables_indicesIndices of the variables.

Definition at line 5809 of file data_set.cpp.

◆ check_constant_columns()

void check_constant_columns ( )

Definition at line 3566 of file data_set.cpp.

◆ check_input_csv()

void check_input_csv ( const string &  input_data_file_name,
const char &  separator_char 
) const

This method checks if the input data file has the correct format. Returns an error message.

Definition at line 8111 of file data_set.cpp.

◆ check_separators()

void check_separators ( const string &  line) const

Definition at line 10571 of file data_set.cpp.

◆ check_special_characters()

void check_special_characters ( const string &  line) const

Definition at line 10646 of file data_set.cpp.

◆ count_nan()

Index count_nan ( ) const

Definition at line 10783 of file data_set.cpp.

◆ count_nan_columns()

Tensor< Index, 1 > count_nan_columns ( ) const

Definition at line 10729 of file data_set.cpp.

◆ count_rows_with_nan()

Index count_rows_with_nan ( ) const

Definition at line 10754 of file data_set.cpp.

◆ create_bounding_limits_kd_tree()

Tensor< Tensor< Index, 1 >, 1 > create_bounding_limits_kd_tree ( const Index &  depth) const
private

Definition at line 8816 of file data_set.cpp.

◆ create_isolation_forest()

Tensor< Tensor< type, 2 >, 1 > create_isolation_forest ( const Index &  trees_number,
const Index &  sub_set_size,
const Index &  max_depth 
) const
private

Definition at line 9202 of file data_set.cpp.

◆ create_isolation_tree()

Tensor< type, 2 > create_isolation_tree ( const Tensor< Index, 1 > &  indices,
const Index &  max_depth 
) const
private

Definition at line 9140 of file data_set.cpp.

◆ create_kd_tree()

void create_kd_tree ( Tensor< Tensor< type, 1 >, 1 > &  tree,
const Tensor< Tensor< Index, 1 >, 1 > &  bounding_limits 
) const
private

Definition at line 8843 of file data_set.cpp.

◆ filter_data()

Tensor< Index, 1 > filter_data ( const Tensor< type, 1 > &  minimums,
const Tensor< type, 1 > &  maximums 
)

Unuses those samples with values outside a defined range.

Parameters
minimumsvector of minimum values in the range. The size must be equal to the number of variables.
maximumsvector of maximum values in the range. The size must be equal to the number of variables.
Todo:

Definition at line 9662 of file data_set.cpp.

◆ fix_repeated_names()

void fix_repeated_names ( )

Definition at line 10840 of file data_set.cpp.

◆ from_XML()

void from_XML ( const tinyxml2::XMLDocument data_set_document)

Definition at line 6855 of file data_set.cpp.

◆ generate_constant_data()

void generate_constant_data ( const Index &  samples_number,
const Index &  variables_number,
const type &  value 
)

Generates an artificial data_set with a given number of samples and number of variables using constant data.

Parameters
samples_numberNumber of samples in the data_set.
variables_numberNumber of variables in the data_set.

Definition at line 9558 of file data_set.cpp.

◆ generate_random_data()

void generate_random_data ( const Index &  samples_number,
const Index &  variables_number 
)

Generates an artificial data_set with a given number of samples and number of variables using random data.

Parameters
samples_numberNumber of samples in the data_set.
variables_numberNumber of variables in the data_set.
Todo:

Definition at line 9574 of file data_set.cpp.

◆ generate_Rosenbrock_data()

void generate_Rosenbrock_data ( const Index &  samples_number,
const Index &  variables_number 
)

Generates an artificial data_set with a given number of samples and number of variables using the Rosenbrock function.

Parameters
samples_numberNumber of samples in the data_set.
variables_numberNumber of variables in the data_set.
Todo:

Definition at line 9607 of file data_set.cpp.

◆ generate_sequential_data()

void generate_sequential_data ( const Index &  samples_number,
const Index &  variables_number 
)

Generates an artificial data_set with a given number of samples and number of variables using sequential data.

Parameters
samples_numberNumber of samples in the data_set.
variables_numberNumber of variables in the data_set.

Definition at line 9587 of file data_set.cpp.

◆ generate_sum_data()

void generate_sum_data ( const Index &  samples_number,
const Index &  variables_number 
)

Definition at line 9637 of file data_set.cpp.

◆ get_batches()

Tensor< Index, 2 > get_batches ( const Tensor< Index, 1 > &  samples_indices,
const Index &  batch_samples_number,
const bool &  shuffle,
const Index &  new_buffer_size = 100 
) const

Returns a vector, where each element is a vector that contains the indices of the different batches of the training samples.

Parameters
shuffleBoolean flag. If true, the indices are shuffled before being distributed into batches; if false, they are not shuffled.

Definition at line 1217 of file data_set.cpp.

◆ get_column_data() [1/3]

Tensor< type, 2 > get_column_data ( const Index &  column_index) const

Returns the data from the data set column with a given index. These data can be stored in a matrix or a vector, depending on whether the column is categorical or not (respectively).

Parameters
column_indexIndex of the column.

Definition at line 4289 of file data_set.cpp.

◆ get_column_data() [2/3]

Tensor< type, 2 > get_column_data ( const Index &  column_index,
const Tensor< Index, 1 > &  rows_indices 
) const

Returns the data from the data set column with a given index. These data can be stored in a matrix or a vector, depending on whether the column is categorical or not (respectively).

Parameters
column_indexIndex of the column.
rows_indicesIndices of the rows.

Definition at line 4332 of file data_set.cpp.

◆ get_column_data() [3/3]

Tensor< type, 2 > get_column_data ( const string &  column_name) const

Returns the data from the data set column with a given name. These data can be stored in a matrix or a vector, depending on whether the column is categorical or not (respectively).

Parameters
column_nameName of the column.

Definition at line 4342 of file data_set.cpp.

◆ get_column_index() [1/2]

Index get_column_index ( const Index &  variable_index) const

Returns the index of the column to which a variable index belongs.

Parameters
variable_indexIndex of the variable to be found.

Definition at line 4214 of file data_set.cpp.

◆ get_column_index() [2/2]

Index get_column_index ( const string &  column_name) const

Returns the index of the column with the given name.

Parameters
column_nameName of the column to be found.

Definition at line 4192 of file data_set.cpp.

◆ get_column_type()

ColumnType get_column_type ( const Index &  index) const
inline

Definition at line 244 of file data_set.h.

◆ get_column_use()

DataSet::VariableUse get_column_use ( const Index &  index) const

Returns a vector containing the use of the column, without taking into account the categories.

Definition at line 2000 of file data_set.cpp.

◆ get_columns()

Tensor< DataSet::Column, 1 > get_columns ( ) const

Returns the columns of the data set.

Definition at line 2697 of file data_set.cpp.

◆ get_columns_names()

Tensor< string, 1 > get_columns_names ( ) const

Returns a string vector that contains the names of the columns.

Definition at line 2473 of file data_set.cpp.

◆ get_columns_number()

Index get_columns_number ( ) const

Returns the number of columns in the data set.

Definition at line 2800 of file data_set.cpp.

◆ get_columns_scalers()

Tensor< Scaler, 1 > get_columns_scalers ( ) const

Definition at line 2408 of file data_set.cpp.

◆ get_columns_uses()

Tensor< DataSet::VariableUse, 1 > get_columns_uses ( ) const

Returns the uses of each column of the data set.

Definition at line 2008 of file data_set.cpp.

◆ get_data()

const Tensor< type, 2 > & get_data ( ) const

Returns a reference to the data matrix in the data set. The number of rows is equal to the number of samples. The number of columns is equal to the number of variables.

Definition at line 3673 of file data_set.cpp.

◆ get_data_file_name()

const string & get_data_file_name ( ) const

Returns the name of the data file.

Definition at line 3704 of file data_set.cpp.

◆ get_data_file_preview()

Tensor< Tensor< string, 1 >, 1 > get_data_file_preview ( ) const

Definition at line 4550 of file data_set.cpp.

◆ get_data_pointer()

Tensor< type, 2 > * get_data_pointer ( )

Definition at line 3679 of file data_set.cpp.

◆ get_default_columns_names()

Tensor< string, 1 > get_default_columns_names ( const Index &  columns_number)
static

Definition at line 9873 of file data_set.cpp.

◆ get_display()

const bool & get_display ( ) const

Returns true if messages from this class can be displayed on the screen, or false if messages from this class can't be displayed on the screen.

Definition at line 94 of file data_set.cpp.

◆ get_gmt()

Index get_gmt ( ) const

Returns the GMT value of the data set, which is 0 by default. Its use is recommended in forecasting problems.

Definition at line 5856 of file data_set.cpp.

◆ get_has_rows_labels()

bool get_has_rows_labels ( ) const

Definition at line 11149 of file data_set.cpp.

◆ get_header_line()

const bool & get_header_line ( ) const

Returns true if the first line of the data file has a header with the names of the variables, and false otherwise.

Definition at line 3712 of file data_set.cpp.

◆ get_input_columns()

Tensor< DataSet::Column, 1 > get_input_columns ( ) const

Returns the input columns of the data set.

Definition at line 2717 of file data_set.cpp.

◆ get_input_columns_binary()

Tensor< bool, 1 > get_input_columns_binary ( ) const

Returns the input columns of the data set.

Definition at line 2739 of file data_set.cpp.

◆ get_input_columns_indices()

Tensor< Index, 1 > get_input_columns_indices ( ) const

Returns an indices vector with the positions of the inputs.

Definition at line 2271 of file data_set.cpp.

◆ get_input_columns_names()

Tensor< string, 1 > get_input_columns_names ( ) const

Returns a string vector that contains the names of the columns whose uses are Input.

Definition at line 2505 of file data_set.cpp.

◆ get_input_columns_number()

Index get_input_columns_number ( ) const

Returns the number of columns whose uses are Input.

Definition at line 2575 of file data_set.cpp.

◆ get_input_data() [1/2]

Tensor< type, 2 > get_input_data ( ) const

Returns a matrix with the input variables in the data set. The number of rows is the number of samples. The number of columns is the number of input variables.

Definition at line 3955 of file data_set.cpp.

◆ get_input_data() [2/2]

Tensor< type, 2 > get_input_data ( const Tensor< Index, 1 > &  samples_indices) const

Returns a tensor with the input variables in the data set. The number of rows is the number of samples. The number of columns is the number of input variables.

Definition at line 3986 of file data_set.cpp.

◆ get_input_time_series_columns_indices()

Tensor< Index, 1 > get_input_time_series_columns_indices ( ) const

Definition at line 2292 of file data_set.cpp.

◆ get_input_time_series_columns_number()

Index get_input_time_series_columns_number ( ) const

Definition at line 2591 of file data_set.cpp.

◆ get_input_variables_dimensions()

const Tensor< Index, 1 > & get_input_variables_dimensions ( ) const

Returns the dimensions of the input variables.

Definition at line 2245 of file data_set.cpp.

◆ get_input_variables_indices()

Tensor< Index, 1 > get_input_variables_indices ( ) const

Returns the indices of the input variables.

Definition at line 3047 of file data_set.cpp.

◆ get_input_variables_names()

Tensor< string, 1 > get_input_variables_names ( ) const

Returns the names of the input variables in the data set. The size of the vector is the number of input variables.

Definition at line 2184 of file data_set.cpp.

◆ get_input_variables_number()

Index get_input_variables_number ( ) const

Returns the number of input variables of the data set. Note that the number of variables does not have to equal the number of columns in the data set, because OpenNN recognizes the categorical columns, separating these categories into variables of the data set.

Definition at line 2859 of file data_set.cpp.

◆ get_input_variables_rank()

Index get_input_variables_rank ( ) const

Definition at line 2251 of file data_set.cpp.

◆ get_input_variables_scalers()

Tensor< Scaler, 1 > get_input_variables_scalers ( ) const

Definition at line 2423 of file data_set.cpp.

◆ get_kd_tree_data()

Tensor< Tensor< type, 1 >, 1 > get_kd_tree_data ( ) const
private

Definition at line 8792 of file data_set.cpp.

◆ get_lags_number()

const Index & get_lags_number ( ) const

Returns the number of lags to be used in a time series prediction application.

Definition at line 3825 of file data_set.cpp.

◆ get_missing_values_label()

const string & get_missing_values_label ( ) const

Returns the string which will be used as label for the missing values in the data file.

Definition at line 3817 of file data_set.cpp.

◆ get_missing_values_method()

DataSet::MissingValuesMethod get_missing_values_method ( ) const

Returns a string with the method used.

Definition at line 3696 of file data_set.cpp.

◆ get_rows_label()

const bool & get_rows_label ( ) const

Returns true if the data file has rows label, and false otherwise.

Definition at line 3720 of file data_set.cpp.

◆ get_rows_label_tensor()

Tensor< string, 1 > get_rows_label_tensor ( ) const

Definition at line 3726 of file data_set.cpp.

◆ get_sample_data() [1/2]

Tensor< type, 1 > get_sample_data ( const Index &  index) const

Returns the inputs and target values of a single sample in the data set.

Parameters
indexIndex of the sample.

Definition at line 4093 of file data_set.cpp.

◆ get_sample_data() [2/2]

Tensor< type, 1 > get_sample_data ( const Index &  sample_index,
const Tensor< Index, 1 > &  variables_indices 
) const

Returns the inputs and target values of a single sample in the data set.

Parameters
sample_indexIndex of the sample.
variables_indicesIndices of the variables.

Definition at line 4123 of file data_set.cpp.

◆ get_sample_input_data()

Tensor< type, 2 > get_sample_input_data ( const Index &  sample_index) const

Returns the inputs values of a single sample in the data set.

Parameters
sample_indexIndex of the sample.

Definition at line 4163 of file data_set.cpp.

◆ get_sample_string()

string get_sample_string ( const Index &  sample_index,
const string &  separator = "," 
) const

Returns a string with the values of the sample corresponding to the given index. The values will be separated by the given separator char.

Parameters
sample_indexIndex of the sample.
separatorSeparator.

Definition at line 1006 of file data_set.cpp.

◆ get_sample_target_data()

Tensor< type, 2 > get_sample_target_data ( const Index &  sample_index) const

Returns the target values of a single sample in the data set.

Parameters
sample_indexIndex of the sample.

Definition at line 4181 of file data_set.cpp.

◆ get_sample_use()

DataSet::SampleUse get_sample_use ( const Index &  index) const

Returns the use of a single sample.

Parameters
indexSample index.

Definition at line 1199 of file data_set.cpp.

◆ get_samples_number()

Index get_samples_number ( ) const
inline

Definition at line 184 of file data_set.h.

◆ get_samples_uses()

const Tensor< DataSet::SampleUse, 1 > & get_samples_uses ( ) const

Returns the use of every sample (training, selection, testing or unused) in a vector.

Definition at line 1207 of file data_set.cpp.

◆ get_samples_uses_numbers()

Tensor< Index, 1 > get_samples_uses_numbers ( ) const

Returns a vector with the number of training, selection, testing and unused samples. The size of that vector is therefore four.

Definition at line 943 of file data_set.cpp.

◆ get_samples_uses_percentages()

Tensor< type, 1 > get_samples_uses_percentages ( ) const

Returns a vector with the uses of the samples in percentages of the data set. Uses: training, selection, testing and unused samples. Note that the vector size is four.

Definition at line 977 of file data_set.cpp.

◆ get_scaling_unscaling_method()

Scaler get_scaling_unscaling_method ( const string &  scaling_unscaling_method)
static

Returns a value of the scaling-unscaling method enumeration from a string containing the name of that method.

Parameters
scaling_unscaling_methodString with the name of the scaling and unscaling method.

Definition at line 3861 of file data_set.cpp.

◆ get_selection_data()

Tensor< type, 2 > get_selection_data ( ) const

Returns a matrix with the selection samples in the data set. The number of rows is the number of selection samples. The number of columns is the number of variables.

Definition at line 3921 of file data_set.cpp.

◆ get_selection_input_data()

Tensor< type, 2 > get_selection_input_data ( ) const

Returns a tensor with selection samples and input variables. The number of rows is the number of selection samples. The number of columns is the number of input variables.

Definition at line 4038 of file data_set.cpp.

◆ get_selection_rows_label_tensor()

Tensor< string, 1 > get_selection_rows_label_tensor ( )

Definition at line 3746 of file data_set.cpp.

◆ get_selection_samples_indices()

Tensor< Index, 1 > get_selection_samples_indices ( ) const

Returns the indices of the samples which will be used for selection.

Definition at line 1098 of file data_set.cpp.

◆ get_selection_samples_number()

Index get_selection_samples_number ( ) const

Returns the number of samples in the data set which will be used for selection.

Definition at line 1402 of file data_set.cpp.

◆ get_selection_target_data()

Tensor< type, 2 > get_selection_target_data ( ) const

Returns a tensor with selection samples and target variables. The number of rows is the number of selection samples. The number of columns is the number of target variables.

Definition at line 4052 of file data_set.cpp.

◆ get_separator()

const DataSet::Separator & get_separator ( ) const

Returns the separator to be used in the data file.

Definition at line 3763 of file data_set.cpp.

◆ get_separator_char()

char get_separator_char ( ) const

Returns the character which will be used as separator in the data file.

Definition at line 3771 of file data_set.cpp.

◆ get_separator_string()

string get_separator_string ( ) const

Returns the string which will be used as separator in the data file.

Definition at line 3794 of file data_set.cpp.

◆ get_steps_ahead()

const Index & get_steps_ahead ( ) const

Returns the number of steps ahead to be used in a time series prediction application.

Definition at line 3833 of file data_set.cpp.

◆ get_subtensor_data()

Tensor< type, 2 > get_subtensor_data ( const Tensor< Index, 1 > &  rows_indices,
const Tensor< Index, 1 > &  variables_indices 
) const

Definition at line 4556 of file data_set.cpp.

◆ get_target_columns()

Tensor< DataSet::Column, 1 > get_target_columns ( ) const

Returns the target columns of the data set.

Definition at line 2759 of file data_set.cpp.

◆ get_target_columns_indices()

Tensor< Index, 1 > get_target_columns_indices ( ) const

Returns an indices vector with the positions of the targets.

Definition at line 2315 of file data_set.cpp.

◆ get_target_columns_names()

Tensor< string, 1 > get_target_columns_names ( ) const

Returns a string vector which contains the names of the columns whose uses are Target.

Definition at line 2528 of file data_set.cpp.

◆ get_target_columns_number()

Index get_target_columns_number ( ) const

Returns the number of columns whose uses are Target.

Definition at line 2609 of file data_set.cpp.

◆ get_target_data() [1/2]

Tensor< type, 2 > get_target_data ( ) const

Returns a matrix with the target variables in the data set. The number of rows is the number of samples. The number of columns is the number of target variables.

Definition at line 3972 of file data_set.cpp.

◆ get_target_data() [2/2]

Tensor< type, 2 > get_target_data ( const Tensor< Index, 1 > &  samples_indices) const

Returns a tensor with the target variables in the data set. The number of rows is the number of given samples. The number of columns is the number of target variables.

Definition at line 3998 of file data_set.cpp.

◆ get_target_time_series_columns_indices()

Tensor< Index, 1 > get_target_time_series_columns_indices ( ) const

Definition at line 2336 of file data_set.cpp.

◆ get_target_time_series_columns_number()

Index get_target_time_series_columns_number ( ) const

Definition at line 2625 of file data_set.cpp.

◆ get_target_variables_indices()

Tensor< Index, 1 > get_target_variables_indices ( ) const

Returns the indices of the target variables.

Definition at line 3094 of file data_set.cpp.

◆ get_target_variables_names()

Tensor< string, 1 > get_target_variables_names ( ) const

Returns the names of the target variables in the data set. The size of the vector is the number of target variables.

Definition at line 2215 of file data_set.cpp.

◆ get_target_variables_number()

Index get_target_variables_number ( ) const

Returns the number of target variables of the data set.

Definition at line 2884 of file data_set.cpp.

◆ get_target_variables_scalers()

Tensor< Scaler, 1 > get_target_variables_scalers ( ) const

Definition at line 2447 of file data_set.cpp.

◆ get_testing_data()

Tensor< type, 2 > get_testing_data ( ) const

Returns a matrix with the testing samples in the data set. The number of rows is the number of testing samples. The number of columns is the number of variables.

Definition at line 3938 of file data_set.cpp.

◆ get_testing_input_data()

Tensor< type, 2 > get_testing_input_data ( ) const

Returns a tensor with testing samples and input variables. The number of rows is the number of testing samples. The number of columns is the number of input variables.

Definition at line 4066 of file data_set.cpp.

◆ get_testing_rows_label_tensor()

Tensor< string, 1 > get_testing_rows_label_tensor ( )

Definition at line 3731 of file data_set.cpp.

◆ get_testing_samples_indices()

Tensor< Index, 1 > get_testing_samples_indices ( ) const

Returns the indices of the samples which will be used for testing.

Definition at line 1123 of file data_set.cpp.

◆ get_testing_samples_number()

Index get_testing_samples_number ( ) const

Returns the number of samples in the data set which will be used for testing.

Definition at line 1422 of file data_set.cpp.

◆ get_testing_target_data()

Tensor< type, 2 > get_testing_target_data ( ) const

Returns a tensor with testing samples and target variables. The number of rows is the number of testing samples. The number of columns is the number of target variables.

Definition at line 4080 of file data_set.cpp.

◆ get_time_column()

const string & get_time_column ( ) const

Returns the name of the time column in the data set.

Definition at line 3841 of file data_set.cpp.

◆ get_time_columns_number()

Index get_time_columns_number ( ) const

Returns the number of columns whose uses are Time.

Definition at line 2643 of file data_set.cpp.

◆ get_time_series_column_data()

Tensor< type, 2 > get_time_series_column_data ( const Index &  column_index) const

Returns the data from the time series column with a given index.

Parameters
column_indexIndex of the column.

Definition at line 4309 of file data_set.cpp.

◆ get_time_series_columns()

Tensor< DataSet::Column, 1 > get_time_series_columns ( ) const

Definition at line 2703 of file data_set.cpp.

◆ get_time_series_columns_names()

Tensor< string, 1 > get_time_series_columns_names ( ) const

Definition at line 2488 of file data_set.cpp.

◆ get_time_series_columns_number()

Index get_time_series_columns_number ( ) const

Returns the number of columns in the time series.

Definition at line 2807 of file data_set.cpp.

◆ get_time_series_data()

const Tensor< type, 2 > & get_time_series_data ( ) const

Returns a reference to the time series data matrix in the data set. Only for time series problems.

Definition at line 3688 of file data_set.cpp.

◆ get_time_series_data_rows_number()

Index get_time_series_data_rows_number ( ) const

Definition at line 2709 of file data_set.cpp.

◆ get_time_series_time_column_index()

Index get_time_series_time_column_index ( ) const

Definition at line 3847 of file data_set.cpp.

◆ get_time_series_variables_names()

Tensor< string, 1 > get_time_series_variables_names ( ) const

Returns a string vector with the names of all the variables in the time series data. The size of the vector is the number of variables.

Definition at line 2151 of file data_set.cpp.

◆ get_time_series_variables_number()

Index get_time_series_variables_number ( ) const

Returns the number of variables in the time series data.

Definition at line 2835 of file data_set.cpp.

◆ get_training_data()

Tensor< type, 2 > get_training_data ( ) const

Returns a matrix with the training samples in the data set. The number of rows is the number of training samples. The number of columns is the number of variables.

Definition at line 3900 of file data_set.cpp.

◆ get_training_input_data()

Tensor< type, 2 > get_training_input_data ( ) const

Returns a matrix with training samples and input variables. The number of rows is the number of training samples. The number of columns is the number of input variables.

Definition at line 4010 of file data_set.cpp.

◆ get_training_samples_indices()

Tensor< Index, 1 > get_training_samples_indices ( ) const

Returns the indices of the samples which will be used for training.

Definition at line 1073 of file data_set.cpp.

◆ get_training_samples_number()

Index get_training_samples_number ( ) const

Returns the number of samples in the data set which will be used for training.

Definition at line 1382 of file data_set.cpp.

◆ get_training_target_data()

Tensor< type, 2 > get_training_target_data ( ) const

Returns a tensor with training samples and target variables. The number of rows is the number of training samples. The number of columns is the number of target variables.

Definition at line 4024 of file data_set.cpp.

◆ get_unused_columns_indices()

Tensor< Index, 1 > get_unused_columns_indices ( ) const

Returns an indices vector with the positions of the unused columns.

Definition at line 2359 of file data_set.cpp.

◆ get_unused_columns_number()

Index get_unused_columns_number ( ) const

Returns the number of columns that are not used.

Definition at line 2661 of file data_set.cpp.

◆ get_unused_samples_indices()

Tensor< Index, 1 > get_unused_samples_indices ( ) const

Returns the indices of the samples set unused.

Definition at line 1173 of file data_set.cpp.

◆ get_unused_samples_number()

Index get_unused_samples_number ( ) const

Returns the number of samples in the data set which will not be used for training, selection or testing.

Definition at line 1455 of file data_set.cpp.

◆ get_unused_variables_indices()

Tensor< Index, 1 > get_unused_variables_indices ( ) const

Returns the indices of the unused variables.

Definition at line 2956 of file data_set.cpp.

◆ get_unused_variables_number()

Index get_unused_variables_number ( ) const

Returns the number of variables which will neither be used as input nor as target.

Definition at line 2910 of file data_set.cpp.

◆ get_used_columns()

Tensor< DataSet::Column, 1 > get_used_columns ( ) const

Returns the used columns of the data set.

Definition at line 2781 of file data_set.cpp.

◆ get_used_columns_indices()

Tensor< Index, 1 > get_used_columns_indices ( ) const

Returns an indices vector with the positions of the used columns.

Definition at line 2383 of file data_set.cpp.

◆ get_used_columns_names()

Tensor< string, 1 > get_used_columns_names ( ) const

Returns a string vector which contains the names of the columns used whether Input, Target or Time.

Definition at line 2551 of file data_set.cpp.

◆ get_used_columns_number()

Index get_used_columns_number ( ) const

Returns the number of columns that are used.

Definition at line 2679 of file data_set.cpp.

◆ get_used_samples_indices()

Tensor< Index, 1 > get_used_samples_indices ( ) const

Returns the indices of the used samples (those which are not set unused).

Definition at line 1148 of file data_set.cpp.

◆ get_used_samples_number()

Index get_used_samples_number ( ) const

Returns the total number of training, selection and testing samples, i.e. those which are not "Unused".

Definition at line 1443 of file data_set.cpp.

◆ get_used_variables_indices()

Tensor< Index, 1 > get_used_variables_indices ( ) const

Returns the indices of the used variables.

Definition at line 3002 of file data_set.cpp.

◆ get_used_variables_number()

Index get_used_variables_number ( ) const

Returns the number of variables which are either input or target.

Definition at line 2259 of file data_set.cpp.

◆ get_variable_data() [1/4]

Tensor< type, 1 > get_variable_data ( const Index &  index) const

Returns all the samples of a single variable in the data set.

Parameters
indexIndex of the variable.

Definition at line 4353 of file data_set.cpp.

◆ get_variable_data() [2/4]

Tensor< type, 1 > get_variable_data ( const Index &  variable_index,
const Tensor< Index, 1 > &  samples_indices 
) const

Returns a given set of samples of a single variable in the data set.

Parameters
variable_indexIndex of the variable.
samples_indicesIndices of the samples.

Definition at line 4442 of file data_set.cpp.

◆ get_variable_data() [3/4]

Tensor< type, 1 > get_variable_data ( const string &  variable_name) const

Returns all the samples of a single variable in the data set.

Parameters
variable_nameName of the variable.

Definition at line 4380 of file data_set.cpp.

◆ get_variable_data() [4/4]

Tensor< type, 1 > get_variable_data ( const string &  variable_name,
const Tensor< Index, 1 > &  samples_indices 
) const

Returns a given set of samples of a single variable in the data set.

Parameters
variable_nameName of the variable.
samples_indicesIndices of the samples.

Definition at line 4481 of file data_set.cpp.

◆ get_variable_index()

Index get_variable_index ( const string &  name) const

Returns a variable index in the data set with given name.

Parameters
nameName of variable.

Definition at line 2937 of file data_set.cpp.

◆ get_variable_indices()

Tensor< Index, 1 > get_variable_indices ( const Index &  column_index) const

Returns the indices of a variable in the data set. Note that the number of variables does not have to equal the number of columns in the data set, because OpenNN recognizes the categorical columns, separating these categories into variables of the data set.

Definition at line 4248 of file data_set.cpp.

◆ get_variable_name()

string get_variable_name ( const Index &  variable_index) const

Returns the name of a single variable in the data set.

Parameters
variable_indexIndex of variable.

Definition at line 2059 of file data_set.cpp.

◆ get_variable_use()

DataSet::VariableUse get_variable_use ( const Index &  index) const

Returns the use of a single variable.

Parameters
indexIndex of variable.

Definition at line 1992 of file data_set.cpp.

◆ get_variables_names()

Tensor< string, 1 > get_variables_names ( ) const

Returns a string vector with the names of all the variables in the data set. The size of the vector is the number of variables.

Definition at line 2118 of file data_set.cpp.

◆ get_variables_number()

Index get_variables_number ( ) const

Returns the number of variables in the data set.

Definition at line 2814 of file data_set.cpp.

◆ get_variables_uses()

Tensor< DataSet::VariableUse, 1 > get_variables_uses ( ) const

Returns a vector containing the use of each column, including the categories. The size of the vector is equal to the number of variables.

Definition at line 2026 of file data_set.cpp.

◆ has_binary_columns()

bool has_binary_columns ( ) const

Definition at line 10668 of file data_set.cpp.

◆ has_categorical_columns()

bool has_categorical_columns ( ) const

Definition at line 10681 of file data_set.cpp.

◆ has_nan()

bool has_nan ( ) const

Returns true if the data contain missing values.

Definition at line 5912 of file data_set.cpp.

◆ has_nan_row()

bool has_nan_row ( const Index &  row_index) const

Returns true if the given row contains missing values.

Definition at line 5922 of file data_set.cpp.

◆ has_selection()

bool has_selection ( ) const

Definition at line 10721 of file data_set.cpp.

◆ has_time_columns()

bool has_time_columns ( ) const

Definition at line 10694 of file data_set.cpp.

◆ has_time_time_series_columns()

bool has_time_time_series_columns ( ) const

Definition at line 10707 of file data_set.cpp.

◆ impute_missing_values_mean()

void impute_missing_values_mean ( )

Substitutes all the missing values by the mean of the corresponding variable.

Definition at line 9768 of file data_set.cpp.

◆ impute_missing_values_median()

void impute_missing_values_median ( )

Substitutes all the missing values by the median of the corresponding variable.

Definition at line 9802 of file data_set.cpp.

◆ impute_missing_values_unuse()

void impute_missing_values_unuse ( )

Sets all the samples with missing values to "Unused".

Definition at line 9753 of file data_set.cpp.

◆ initialize_sequential()

void initialize_sequential ( Tensor< Index, 1 > &  new_tensor,
const Index &  start,
const Index &  step,
const Index &  end 
) const

Definition at line 10947 of file data_set.cpp.

◆ intialize_sequential()

void intialize_sequential ( Tensor< type, 1 > &  new_tensor,
const type &  start,
const type &  step,
const type &  end 
) const

Definition at line 10964 of file data_set.cpp.

◆ is_empty()

bool is_empty ( ) const

Returns true if the data matrix is empty, and false otherwise.

Definition at line 3658 of file data_set.cpp.

◆ is_sample_unused()

bool is_sample_unused ( const Index &  index) const

Returns true if a given sample is to be unused, and false otherwise.

Parameters
indexSample index.

Definition at line 926 of file data_set.cpp.

◆ is_sample_used()

bool is_sample_used ( const Index &  index) const

Returns true if a given sample is to be used for training, selection or testing, and false if it is to be unused.

Parameters
indexSample index.

Definition at line 910 of file data_set.cpp.

◆ load()

void load ( const string &  file_name)

Loads the members of a data set object from an XML-type file:

  • Samples number.
  • Training samples number.
  • Training samples indices.
  • Selection samples number.
  • Selection samples indices.
  • Testing samples number.
  • Testing samples indices.
  • Input variables number.
  • Input variables indices.
  • Target variables number.
  • Target variables indices.
  • Input variables name.
  • Target variables name.
  • Input variables description.
  • Target variables description.
  • Display.
  • Data.

Please pay attention to the file format, which is specified in the User's Guide.

Parameters
file_nameName of data set XML-type file.

Definition at line 7716 of file data_set.cpp.

◆ load_data_binary()

void load_data_binary ( )

This method loads the data from a binary data file.

Definition at line 8023 of file data_set.cpp.

◆ load_time_series_data_binary()

void load_time_series_data_binary ( const string &  time_series_data_file_name)

This method loads time series data from a binary data file.

Definition at line 8067 of file data_set.cpp.

◆ print()

void print ( ) const

Prints to the screen in text format the main numbers from the data set object.

Definition at line 7664 of file data_set.cpp.

◆ print_columns()

void print_columns ( ) const

Definition at line 7735 of file data_set.cpp.

◆ print_columns_types()

void print_columns_types ( ) const

Definition at line 7749 of file data_set.cpp.

◆ print_columns_uses()

void print_columns_uses ( ) const

Definition at line 7766 of file data_set.cpp.

◆ print_data()

void print_data ( ) const

Prints to the screen the values of the data matrix.

Definition at line 7783 of file data_set.cpp.

◆ print_data_file_preview()

void print_data_file_preview ( ) const

Definition at line 6072 of file data_set.cpp.

◆ print_data_preview()

void print_data_preview ( ) const

Prints to the screen a preview of the data matrix, i.e., the first, second and last samples.

Definition at line 7792 of file data_set.cpp.

◆ print_input_target_columns_correlations()

void print_input_target_columns_correlations ( ) const

Print on screen the correlation between targets and inputs.

Definition at line 5960 of file data_set.cpp.

◆ print_inputs_correlations()

void print_inputs_correlations ( ) const

Print on screen the correlation between variables in the data set.

Definition at line 6064 of file data_set.cpp.

◆ print_missing_values_information()

void print_missing_values_information ( ) const

Print on screen the information about the missing values in the data set.

  • Total number of missing values.
  • Number of variables with missing values.
  • Number of samples with missing values.

Definition at line 5940 of file data_set.cpp.

◆ print_top_input_target_columns_correlations()

void print_top_input_target_columns_correlations ( ) const

This method prints on screen the correlation between inputs and targets.

Parameters
numberNumber of variables to be printed.

Definition at line 5983 of file data_set.cpp.

◆ print_top_inputs_correlations()

void print_top_inputs_correlations ( ) const

This method prints on screen the correlation between variables.

Parameters
numberNumber of variables to be printed.

Definition at line 6091 of file data_set.cpp.

◆ push_back() [1/2]

Tensor< Index, 1 > push_back ( const Tensor< Index, 1 > &  old_vector,
const Index &  new_string 
) const

Definition at line 10915 of file data_set.cpp.

◆ push_back() [2/2]

Tensor< string, 1 > push_back ( const Tensor< string, 1 > &  old_vector,
const string &  new_string 
) const

Definition at line 10931 of file data_set.cpp.

◆ read_csv()

void read_csv ( )

Definition at line 9853 of file data_set.cpp.

◆ read_csv_1()

void read_csv_1 ( )

Definition at line 9890 of file data_set.cpp.

◆ read_csv_2_complete()

void read_csv_2_complete ( )

Definition at line 10253 of file data_set.cpp.

◆ read_csv_2_simple()

void read_csv_2_simple ( )

Definition at line 10050 of file data_set.cpp.

◆ read_csv_3_complete()

void read_csv_3_complete ( )

Definition at line 10395 of file data_set.cpp.

◆ read_csv_3_simple()

void read_csv_3_simple ( )

Definition at line 10140 of file data_set.cpp.

◆ read_input_csv()

Tensor< type, 2 > read_input_csv ( const string &  input_data_file_name,
const char &  separator_char,
const string &  missing_values_label,
const bool &  has_columns_name,
const bool &  has_rows_label 
) const

This method loads data from a file and returns a matrix containing the input columns.

Definition at line 8182 of file data_set.cpp.

◆ save()

void save ( const string &  file_name) const

Saves the members of a data set object to an XML-type file in an XML-type format.

Parameters
file_nameName of data set XML-type file.

Definition at line 7681 of file data_set.cpp.

◆ save_data()

void save_data ( ) const

Saves to the data file the values of the data matrix.

Definition at line 7844 of file data_set.cpp.

◆ save_data_binary()

void save_data_binary ( const string &  binary_data_file_name) const

Saves to the data file the values of the data matrix in binary format.

Definition at line 7909 of file data_set.cpp.

◆ save_time_series_data_binary()

void save_time_series_data_binary ( const string &  binary_data_file_name) const

Saves to the data file the values of the time series data matrix in binary format.

Definition at line 7958 of file data_set.cpp.

◆ scale_data()

Tensor< Descriptives, 1 > scale_data ( )

Definition at line 6146 of file data_set.cpp.

◆ scale_input_variables()

Tensor< Descriptives, 1 > scale_input_variables ( )

It scales every input variable with the given method. The method to be used is that in the scaling and unscaling method variable.

Definition at line 6243 of file data_set.cpp.

◆ scale_target_variables()

Tensor< Descriptives, 1 > scale_target_variables ( )

Calculates the input and target variables descriptives. Then it scales the target variables with those values. The method to be used is that in the scaling and unscaling method variable. Finally, it returns the descriptives.

Definition at line 6298 of file data_set.cpp.

◆ scrub_missing_values()

void scrub_missing_values ( )

General method for dealing with missing values. It switches among the different scrubbing methods available, according to the corresponding value in the missing values object.

Definition at line 9828 of file data_set.cpp.

◆ select_outliers_via_contamination()

Tensor< Index, 1 > select_outliers_via_contamination ( const Tensor< type, 1 > &  outlier_ranks,
const type &  contamination = type(0.05),
bool  higher = true 
) const
private

Definition at line 8634 of file data_set.cpp.

◆ select_outliers_via_standard_deviation()

Tensor< Index, 1 > select_outliers_via_standard_deviation ( const Tensor< type, 1 > &  outlier_ranks,
const type &  deviation_factor = type(2.0),
bool  higher = true 
) const
private

Definition at line 8673 of file data_set.cpp.

◆ set() [1/8]

void set ( )

Sets zero samples and zero variables in the data set.

Definition at line 4586 of file data_set.cpp.

◆ set() [2/8]

void set ( const DataSet other_data_set)

Sets the members of this data set object with those from another data set object.

Parameters
other_data_setData set object to be copied.

Definition at line 4743 of file data_set.cpp.

◆ set() [3/8]

void set ( const Index &  new_samples_number,
const Index &  new_variables_number 
)

Sets new numbers of samples and variables in the inputs targets data set. All the samples are set for training. All the variables are set as inputs.

Parameters
new_samples_numberNumber of samples.
new_variables_numberNumber of variables.

Definition at line 4649 of file data_set.cpp.

◆ set() [4/8]

void set ( const Index &  new_samples_number,
const Index &  new_inputs_number,
const Index &  new_targets_number 
)

Sets new numbers of samples and inputs and target variables in the data set. The variables in the data set are the number of inputs plus the number of targets.

Parameters
new_samples_numberNumber of samples.
new_inputs_numberNumber of input variables.
new_targets_numberNumber of target variables.

Definition at line 4703 of file data_set.cpp.

◆ set() [5/8]

void set ( const string &  file_name)

Sets the data set members by loading them from an XML file.

Parameters
file_nameData set XML file_name.

Definition at line 4775 of file data_set.cpp.

◆ set() [6/8]

void set ( const string &  data_file_name,
const char &  separator,
const bool &  new_has_columns_names 
)

Definition at line 4606 of file data_set.cpp.

◆ set() [7/8]

void set ( const Tensor< type, 2 > &  new_data)

Sets all variables from a data matrix.

Parameters
new_dataData matrix.

Definition at line 4628 of file data_set.cpp.

◆ set() [8/8]

void set ( const tinyxml2::XMLDocument data_set_document)

Sets the data set members from an XML document.

Parameters
data_set_documentTinyXML document containing the member data.

Definition at line 4764 of file data_set.cpp.

◆ set_binary_simple_columns()

void set_binary_simple_columns ( )

Definition at line 3483 of file data_set.cpp.

◆ set_column_name()

void set_column_name ( const Index &  column_index,
const string &  new_name 
)

Sets the name of a single column.

Parameters
column_indexIndex of column.
new_nameNew name for that column.

Definition at line 1983 of file data_set.cpp.

◆ set_column_type() [1/2]

void set_column_type ( const Index &  index,
const ColumnType new_type 
)

Definition at line 3277 of file data_set.cpp.

◆ set_column_type() [2/2]

void set_column_type ( const string &  name,
const ColumnType new_type 
)

Definition at line 3283 of file data_set.cpp.

◆ set_column_use() [1/2]

void set_column_use ( const Index &  index,
const VariableUse new_use 
)

Sets the use of a single column.

Parameters
indexIndex of column.
new_useUse for that column.

Definition at line 3255 of file data_set.cpp.

◆ set_column_use() [2/2]

void set_column_use ( const string &  name,
const VariableUse new_use 
)

Sets the use of a single column.

Parameters
nameName of column.
new_useUse for that column.

Definition at line 3270 of file data_set.cpp.

◆ set_columns()

void set_columns ( const Tensor< Column, 1 > &  new_columns)

Definition at line 1912 of file data_set.cpp.

◆ set_columns_missing_values_number() [1/2]

void set_columns_missing_values_number ( )

Definition at line 10822 of file data_set.cpp.

◆ set_columns_missing_values_number() [2/2]

void set_columns_missing_values_number ( const Tensor< Index, 1 > &  new_columns_missing_values_number)

Definition at line 10816 of file data_set.cpp.

◆ set_columns_names()

void set_columns_names ( const Tensor< string, 1 > &  new_names)

Sets new names for the columns in the data set from a vector of strings. The size of that vector must be equal to the total number of variables.

Parameters
new_namesName of variables.

Definition at line 3403 of file data_set.cpp.

◆ set_columns_number()

void set_columns_number ( const Index &  new_columns_number)

Sets a new number of variables in the variables object. All variables are set as inputs but the last one, which is set as targets.

Parameters
new_columns_numberNumber of variables.

Definition at line 3465 of file data_set.cpp.

◆ set_columns_scalers()

void set_columns_scalers ( const Scaler &  scalers)

Definition at line 3473 of file data_set.cpp.

◆ set_columns_unused()

void set_columns_unused ( )

Sets all columns in the data_set as unused columns.

Definition at line 3200 of file data_set.cpp.

◆ set_columns_uses() [1/2]

void set_columns_uses ( const Tensor< string, 1 > &  new_columns_uses)

Sets the uses of the data set columns.

Parameters
new_columns_usesString vector that contains the new uses to be set, note that this vector needs to be the size of the number of columns in the data set.

Definition at line 3142 of file data_set.cpp.

◆ set_columns_uses() [2/2]

void set_columns_uses ( const Tensor< VariableUse, 1 > &  new_columns_uses)

Sets the uses of the data set columns.

Parameters
new_columns_usesDataSet::VariableUse vector that contains the new uses to be set, note that this vector needs to be the size of the number of columns in the data set.

Definition at line 3173 of file data_set.cpp.

◆ set_data()

void set_data ( const Tensor< type, 2 > &  new_data)

Sets a new data matrix. The number of rows must be equal to the number of samples. The number of columns must be equal to the number of variables. Indices of all training, selection and testing samples and inputs and target variables do not change.

Parameters
new_dataData matrix.

Definition at line 4831 of file data_set.cpp.

◆ set_data_binary_random()

void set_data_binary_random ( )

Initializes the data matrix with random values chosen from a uniform distribution with given minimum and maximum. The targets will be binary randoms.

Definition at line 6458 of file data_set.cpp.

◆ set_data_constant()

void set_data_constant ( const type &  new_value)

Initializes the data matrix with a given value.

Parameters
new_valueInitialization value.

Definition at line 6440 of file data_set.cpp.

◆ set_data_file_name()

void set_data_file_name ( const string &  new_data_file_name)

Sets the name of the data file. It also loads the data from that file. Moreover, it sets the variables and samples objects.

Parameters
new_data_file_nameName of the file containing the data.

Definition at line 4858 of file data_set.cpp.

◆ set_data_random()

void set_data_random ( )

Initializes the data matrix with random values chosen from a uniform distribution with given minimum and maximum.

Definition at line 6449 of file data_set.cpp.

◆ set_default()

void set_default ( )

Sets the default member values:

  • Display: True.

Definition at line 4796 of file data_set.cpp.

◆ set_default_columns_names()

void set_default_columns_names ( )

This method puts the names of the columns in the data_set. This is used when the data_set does not have a header, the default names are: column_0, column_1, ..., column_n.

Definition at line 1968 of file data_set.cpp.

◆ set_default_columns_scalers()

void set_default_columns_scalers ( )

Returns a vector of strings containing the scaling method that best fits each of the input variables.

Todo:
Takes too long in big files.

Definition at line 6128 of file data_set.cpp.

◆ set_default_columns_uses()

void set_default_columns_uses ( )

This method sets the n columns of the data_set by default, i.e. until column n-1 are Input and column n is Target.

Definition at line 1921 of file data_set.cpp.

◆ set_display()

void set_display ( const bool &  new_display)

Sets a new display value. If it is set to true messages from this class are to be displayed on the screen; if it is set to false messages from this class are not to be displayed on the screen.

Parameters
new_displayDisplay value.

Definition at line 4785 of file data_set.cpp.

◆ set_gmt()

void set_gmt ( Index &  new_gmt)

Sets the value of the gmt, by default it is 0. This is recommended to use in forecasting problems.

Definition at line 5865 of file data_set.cpp.

◆ set_has_columns_names()

void set_has_columns_names ( const bool &  new_has_columns_names)

Sets if the data file contains a header with the names of the columns.

Definition at line 4866 of file data_set.cpp.

◆ set_has_rows_label()

void set_has_rows_label ( const bool &  new_has_rows_label)

Sets if the data file contains rows label.

Definition at line 4874 of file data_set.cpp.

◆ set_input()

void set_input ( )

Sets all the variables in the data set as input variables.

Definition at line 3428 of file data_set.cpp.

◆ set_input_columns()

void set_input_columns ( const Tensor< Index, 1 > &  input_columns_indices,
const Tensor< bool, 1 > &  input_columns_use 
)

Definition at line 3241 of file data_set.cpp.

◆ set_input_columns_unused()

void set_input_columns_unused ( )

Sets all input columns in the data_set as unused columns.

Definition at line 3229 of file data_set.cpp.

◆ set_input_target_columns()

void set_input_target_columns ( const Tensor< Index, 1 > &  input_columns,
const Tensor< Index, 1 > &  target_columns 
)

Definition at line 3211 of file data_set.cpp.

◆ set_input_variables_dimensions()

void set_input_variables_dimensions ( const Tensor< Index, 1 > &  new_inputs_dimensions)

Sets new input dimensions in the data set.

Definition at line 3650 of file data_set.cpp.

◆ set_lags_number()

void set_lags_number ( const Index &  new_lags_number)

Sets a new number of lags to be defined for a time series prediction application. When loading the data file, the time series data will be modified according to this number.

Parameters
new_lags_number: Number of lags (x-1, ..., x-l) to be used.

Definition at line 5022 of file data_set.cpp.

◆ set_missing_values_label()

void set_missing_values_label ( const string &  new_missing_values_label)

Sets a new label for the missing values.

Parameters
new_missing_values_label: Label for the missing values.

Definition at line 4961 of file data_set.cpp.

◆ set_missing_values_method() [1/2]

void set_missing_values_method ( const MissingValuesMethod new_missing_values_method)

Sets a new method for the missing values.

Parameters
new_missing_values_method: Method for the missing values.

Definition at line 4985 of file data_set.cpp.

◆ set_missing_values_method() [2/2]

void set_missing_values_method ( const string &  new_missing_values_method)

Definition at line 4991 of file data_set.cpp.

◆ set_missing_values_number() [1/2]

void set_missing_values_number ( )

Definition at line 10810 of file data_set.cpp.

◆ set_missing_values_number() [2/2]

void set_missing_values_number ( const Index &  new_missing_values_number)

Definition at line 10804 of file data_set.cpp.

◆ set_rows_missing_values_number() [1/2]

void set_rows_missing_values_number ( )

Definition at line 10834 of file data_set.cpp.

◆ set_rows_missing_values_number() [2/2]

void set_rows_missing_values_number ( const Index &  new_rows_missing_values_number)

Definition at line 10828 of file data_set.cpp.

◆ set_sample_use() [1/2]

void set_sample_use ( const Index &  index,
const SampleUse new_use 
)

Sets the use of a single sample.

Parameters
index: Index of sample.
new_use: Use for that sample.

Definition at line 1591 of file data_set.cpp.

◆ set_sample_use() [2/2]

void set_sample_use ( const Index &  index,
const string &  new_use 
)

Sets the use of a single sample from a string.

Parameters
index: Index of sample.
new_use: String with the use name ("Training", "Selection", "Testing" or "Unused").

Definition at line 1602 of file data_set.cpp.

◆ set_samples_number()

void set_samples_number ( const Index &  new_samples_number)

Sets a new number of samples in the data set. All samples are also set for training. The indices of the inputs and target variables do not change.

Parameters
new_samples_number: Number of samples.

Definition at line 5062 of file data_set.cpp.

◆ set_samples_unused() [1/2]

void set_samples_unused ( )

Sets all the samples in the data set as unused.

Definition at line 1562 of file data_set.cpp.

◆ set_samples_unused() [2/2]

void set_samples_unused ( const Tensor< Index, 1 > &  indices)

Sets the samples with the given indices in the data set as unused.

Parameters
indices: Vector with the indices of the samples in the data set to be set as unused.

Definition at line 1576 of file data_set.cpp.

◆ set_samples_uses() [1/2]

void set_samples_uses ( const Tensor< SampleUse, 1 > &  new_uses)

Sets new uses to all the samples from a single vector.

Parameters
new_uses: Vector of use structures. The size of the given vector must be equal to the number of samples.

Definition at line 1637 of file data_set.cpp.

◆ set_samples_uses() [2/2]

void set_samples_uses ( const Tensor< string, 1 > &  new_uses)

Sets new uses to all the samples from a single vector of strings.

Parameters
new_uses: Vector of use strings. Possible values for the elements are "Training", "Selection", "Testing" and "Unused". The size of the given vector must be equal to the number of samples.

Definition at line 1670 of file data_set.cpp.

◆ set_selection() [1/2]

void set_selection ( )

Sets all the samples in the data set for selection.

Definition at line 1488 of file data_set.cpp.

◆ set_selection() [2/2]

void set_selection ( const Tensor< Index, 1 > &  indices)

Sets samples with given indices in the data set for selection.

Parameters
indices: Vector with the indices of the samples in the data set to be used for selection.

Definition at line 1531 of file data_set.cpp.

◆ set_separator() [1/3]

void set_separator ( const char &  new_separator)

Sets a new separator from a char.

Parameters
new_separator: Char with the separator value.

Definition at line 4892 of file data_set.cpp.

◆ set_separator() [2/3]

void set_separator ( const Separator new_separator)

Sets a new separator.

Parameters
new_separator: Separator value.

Definition at line 4883 of file data_set.cpp.

◆ set_separator() [3/3]

void set_separator ( const string &  new_separator_string)

Sets a new separator from a string.

Parameters
new_separator_string: String with the separator value.

Definition at line 4926 of file data_set.cpp.

◆ set_steps_ahead_number()

void set_steps_ahead_number ( const Index &  new_steps_ahead_number)

Sets a new number of steps ahead to be defined for a time series prediction application. When loading the data file, the time series data will be modified according to this number.

Parameters
new_steps_ahead_number: Number of steps ahead to be used.

Definition at line 5032 of file data_set.cpp.

◆ set_target()

void set_target ( )

Sets all the variables in the data set as target variables.

Definition at line 3441 of file data_set.cpp.

◆ set_testing() [1/2]

void set_testing ( )

Sets all the samples in the data set for testing.

Definition at line 1501 of file data_set.cpp.

◆ set_testing() [2/2]

void set_testing ( const Tensor< Index, 1 > &  indices)

Sets samples with given indices in the data set for testing.

Parameters
indices: Vector with the indices of the samples in the data set to be used for testing.

Definition at line 1547 of file data_set.cpp.

◆ set_threads_number()

void set_threads_number ( const int &  new_threads_number)

Definition at line 5047 of file data_set.cpp.

◆ set_time_column()

void set_time_column ( const string &  new_time_column)

Sets the new position where the time data is located in the data set.

Parameters
new_time_column: Position where the time data is located.

Definition at line 5041 of file data_set.cpp.

◆ set_time_series_columns_number()

void set_time_series_columns_number ( const Index &  new_variables_number)

Definition at line 4847 of file data_set.cpp.

◆ set_time_series_data()

void set_time_series_data ( const Tensor< type, 2 > &  new_data)

Definition at line 4841 of file data_set.cpp.

◆ set_training() [1/2]

void set_training ( )

Sets all the samples in the data set for training.

Definition at line 1475 of file data_set.cpp.

◆ set_training() [2/2]

void set_training ( const Tensor< Index, 1 > &  indices)

Sets samples with given indices in the data set for training.

Parameters
indices: Vector with the indices of the samples in the data set to be used for training.

Definition at line 1515 of file data_set.cpp.

◆ set_variable_name()

void set_variable_name ( const Index &  variable_index,
const string &  new_variable_name 
)

This method sets the name of a single variable.

Parameters
variable_index: Index of variable.
new_variable_name: Name of variable.

Definition at line 3295 of file data_set.cpp.

◆ set_variables_names()

void set_variables_names ( const Tensor< string, 1 > &  new_variables_names)

Sets new names for the variables in the data set from a vector of strings. The size of that vector must be equal to the total number of variables.

Parameters
new_variables_names: Names of the variables.

Definition at line 3355 of file data_set.cpp.

◆ set_variables_unused()

void set_variables_unused ( )

Sets all the variables in the data set as unused variables.

Definition at line 3452 of file data_set.cpp.

◆ shuffle()

void shuffle ( )

Definition at line 11113 of file data_set.cpp.

◆ split_isolation_tree()

Index split_isolation_tree ( Tensor< type, 2 > &  tree,
list< list< Index > > &  tree_simulation,
list< Index > &  tree_index 
) const
private

Definition at line 9082 of file data_set.cpp.

◆ split_samples()

Tensor< Index, 2 > split_samples ( const Tensor< Index, 1 > &  samples_indices,
const Index &  new_batch_size 
) const

Definition at line 10981 of file data_set.cpp.

◆ split_samples_random()

void split_samples_random ( const type &  training_samples_ratio = static_cast<type>(0.6),
const type &  selection_samples_ratio = static_cast<type>(0.2),
const type &  testing_samples_ratio = static_cast<type>(0.2) 
)

Creates new training, selection and testing indices at random.

Parameters
training_samples_ratio: Ratio of training samples in the data set.
selection_samples_ratio: Ratio of selection samples in the data set.
testing_samples_ratio: Ratio of testing samples in the data set.

Definition at line 1726 of file data_set.cpp.

◆ split_samples_sequential()

void split_samples_sequential ( const type &  training_samples_ratio = static_cast<type>(0.6),
const type &  selection_samples_ratio = static_cast<type>(0.2),
const type &  testing_samples_ratio = static_cast<type>(0.2) 
)

Creates new training, selection and testing indices with sequential indices.

Parameters
training_samples_ratio: Ratio of training samples in the data set.
selection_samples_ratio: Ratio of selection samples in the data set.
testing_samples_ratio: Ratio of testing samples in the data set.

Definition at line 1834 of file data_set.cpp.

◆ transform_binary_column()

Tensor< type, 2 > transform_binary_column ( const Tensor< type, 1 > &  column) const

Definition at line 3620 of file data_set.cpp.

◆ transform_time_series()

void transform_time_series ( )

Arranges an input-target DataSet from a time series matrix, according to the number of lags.

Definition at line 8007 of file data_set.cpp.

◆ transform_time_series_columns()

void transform_time_series_columns ( )

This method transforms the columns into time series for forecasting problems.

Definition at line 790 of file data_set.cpp.

◆ transform_time_series_data()

void transform_time_series_data ( )

Definition at line 870 of file data_set.cpp.

◆ unscale_data()

void unscale_data ( const Tensor< Descriptives, 1 > &  variables_descriptives)

Definition at line 6197 of file data_set.cpp.

◆ unscale_input_variables()

void unscale_input_variables ( const Tensor< Descriptives, 1 > &  input_variables_descriptives)

It unscales every input variable with the given method. The method to be used is that in the scaling and unscaling method variable.

Definition at line 6351 of file data_set.cpp.

◆ unscale_target_variables()

void unscale_target_variables ( const Tensor< Descriptives, 1 > &  targets_descriptives)

It unscales the target variables with the given descriptives. The method to be used is that in the scaling and unscaling method variable.

Definition at line 6397 of file data_set.cpp.

◆ unuse_constant_columns()

Tensor< string, 1 > unuse_constant_columns ( )

Removes the input or target indices of those variables with zero standard deviation. It might change the size of the vectors containing the input and target indices.

Definition at line 5073 of file data_set.cpp.

◆ unuse_repeated_samples()

Tensor< Index, 1 > unuse_repeated_samples ( )

Removes the training, selection and testing indices of those samples which are repeated in the data matrix. It might change the size of the vectors containing the training, selection and testing indices.

Definition at line 5112 of file data_set.cpp.

◆ unuse_Tukey_outliers()

void unuse_Tukey_outliers ( const type &  cleaning_parameter = type(1.5))

Calculates the outliers of the data set using Tukey's test and sets them as unused samples.

Parameters
cleaning_parameter: Parameter used to detect outliers.
Todo:

Definition at line 8624 of file data_set.cpp.

◆ unuse_uncorrelated_columns()

Tensor< string, 1 > unuse_uncorrelated_columns ( const type &  minimum_correlation = type(0.25))

Returns the unused variables without correlation.

Parameters
minimum_correlation: Minimum correlation between variables.

Definition at line 5163 of file data_set.cpp.

◆ write_XML()

void write_XML ( tinyxml2::XMLPrinter file_stream) const

Serializes the data set object into an XML document of the TinyXML library without keeping the DOM tree in memory.

Definition at line 6486 of file data_set.cpp.

Member Data Documentation

◆ columns

Tensor<Column, 1> columns
private

Definition at line 764 of file data_set.h.

◆ columns_missing_values_number

Tensor<Index, 1> columns_missing_values_number
private

Definition at line 826 of file data_set.h.

◆ data

Tensor<type, 2> data
private

Data Matrix. The number of rows is the number of samples. The number of columns is the number of variables.

Definition at line 754 of file data_set.h.

◆ data_file_name

string data_file_name
private

Data file name.

Definition at line 772 of file data_set.h.

◆ data_file_preview

Tensor<Tensor<string, 1>, 1> data_file_preview
private

Definition at line 790 of file data_set.h.

◆ display

bool display = true
private

Display messages to screen.

Definition at line 832 of file data_set.h.

◆ gmt

Index gmt = 0
private

Definition at line 814 of file data_set.h.

◆ has_columns_names

bool has_columns_names = false
private

Whether the data file has a header which contains the names of the variables.

Definition at line 784 of file data_set.h.

◆ has_rows_labels

bool has_rows_labels = false
private

Whether the data file contains the rows labels.

Definition at line 788 of file data_set.h.

◆ input_variables_dimensions

Tensor<Index, 1> input_variables_dimensions
private

Definition at line 766 of file data_set.h.

◆ lags_number

Index lags_number = 0
private

Number of lags.

Definition at line 800 of file data_set.h.

◆ missing_values_label

string missing_values_label = "NA"
private

Missing values label.

Definition at line 780 of file data_set.h.

◆ missing_values_method

MissingValuesMethod missing_values_method = MissingValuesMethod::Unuse
private

Missing values method.

Definition at line 820 of file data_set.h.

◆ missing_values_number

Index missing_values_number
private

Missing values.

Definition at line 824 of file data_set.h.

◆ non_blocking_thread_pool

NonBlockingThreadPool* non_blocking_thread_pool = nullptr
private

Definition at line 745 of file data_set.h.

◆ product_vector_vector

const Eigen::array<IndexPair<Index>, 1> product_vector_vector = {IndexPair<Index>(0, 0)}
private

Definition at line 834 of file data_set.h.

◆ rows_labels

Tensor<string, 1> rows_labels
private

Definition at line 760 of file data_set.h.

◆ rows_missing_values_number

Index rows_missing_values_number
private

Definition at line 828 of file data_set.h.

◆ samples_uses

Tensor<SampleUse, 1> samples_uses
private

Definition at line 758 of file data_set.h.

◆ separator

Separator separator = Separator::Comma
private

Separator character.

Definition at line 776 of file data_set.h.

◆ steps_ahead

Index steps_ahead = 0
private

Number of steps ahead.

Definition at line 804 of file data_set.h.

◆ thread_pool_device

ThreadPoolDevice* thread_pool_device = nullptr
private

Definition at line 746 of file data_set.h.

◆ time_column

string time_column
private

Column where the time variable is located for forecasting applications.

Definition at line 796 of file data_set.h.

◆ time_series_columns

Tensor<Column, 1> time_series_columns
private

Definition at line 812 of file data_set.h.

◆ time_series_data

Tensor<type, 2> time_series_data
private

Time series data matrix. The number of rows is the number of samples before time series transformation. The number of columns is the number of variables before time series transformation.

Definition at line 810 of file data_set.h.


The documentation for this class was generated from the following files: