This class represents the concept of data set for data modelling problems, such as approximation, classification or forecasting. More...
#include <data_set.h>
Classes | |
struct | Column |
This structure represents the columns of the DataSet. More... | |
Public Types | |
enum class | Separator { Space , Tab , Comma , Semicolon } |
Enumeration of available separators for the data file. More... | |
enum class | MissingValuesMethod { Unuse , Mean , Median } |
Enumeration of available methods for missing values in the data. More... | |
enum class | ProjectType { Approximation , Classification , Forecasting , ImageClassification } |
Enumeration of the learning tasks. More... | |
enum class | SampleUse { Training , Selection , Testing , UnusedSample } |
enum class | VariableUse { Id , Input , Target , Time , UnusedVariable } |
enum class | ColumnType { Numeric , Binary , Categorical , DateTime , Constant } |
Public Member Functions | |
DataSet () | |
DataSet (const Tensor< type, 2 > &) | |
DataSet (const Index &, const Index &) | |
DataSet (const Index &, const Index &, const Index &) | |
DataSet (const string &, const char &, const bool &) | |
virtual | ~DataSet () |
Destructor. More... | |
Index | get_samples_number () const |
Index | get_training_samples_number () const |
Returns the number of samples in the data set which will be used for training. More... | |
Index | get_selection_samples_number () const |
Returns the number of samples in the data set which will be used for selection. More... | |
Index | get_testing_samples_number () const |
Returns the number of samples in the data set which will be used for testing. More... | |
Index | get_used_samples_number () const |
Index | get_unused_samples_number () const |
Tensor< Index, 1 > | get_training_samples_indices () const |
Returns the indices of the samples which will be used for training. More... | |
Tensor< Index, 1 > | get_selection_samples_indices () const |
Returns the indices of the samples which will be used for selection. More... | |
Tensor< Index, 1 > | get_testing_samples_indices () const |
Returns the indices of the samples which will be used for testing. More... | |
Tensor< Index, 1 > | get_used_samples_indices () const |
Returns the indices of the used samples (those which are not set unused). More... | |
Tensor< Index, 1 > | get_unused_samples_indices () const |
Returns the indices of the samples set unused. More... | |
SampleUse | get_sample_use (const Index &) const |
const Tensor< SampleUse, 1 > & | get_samples_uses () const |
Returns the use of every sample (training, selection, testing or unused) in a vector. More... | |
Tensor< Index, 1 > | get_samples_uses_numbers () const |
Tensor< type, 1 > | get_samples_uses_percentages () const |
string | get_sample_string (const Index &, const string &=",") const |
Tensor< Column, 1 > | get_columns () const |
Returns the columns of the data set. More... | |
Tensor< Column, 1 > | get_time_series_columns () const |
Index | get_time_series_data_rows_number () const |
Tensor< Column, 1 > | get_input_columns () const |
Returns the input columns of the data set. More... | |
Tensor< bool, 1 > | get_input_columns_binary () const |
Returns the input columns of the data set. More... | |
Tensor< Column, 1 > | get_target_columns () const |
Returns the target columns of the data set. More... | |
Tensor< Column, 1 > | get_used_columns () const |
Returns the used columns of the data set. More... | |
Index | get_columns_number () const |
Returns the number of columns in the data set. More... | |
Index | get_input_columns_number () const |
Returns the number of columns whose uses are Input. More... | |
Index | get_input_time_series_columns_number () const |
Index | get_target_columns_number () const |
Returns the number of columns whose uses are Target. More... | |
Index | get_target_time_series_columns_number () const |
Index | get_time_columns_number () const |
Returns the number of columns whose uses are Time. More... | |
Index | get_unused_columns_number () const |
Returns the number of columns that are not used. More... | |
Index | get_used_columns_number () const |
Returns the number of columns that are used. More... | |
Index | get_column_index (const string &) const |
Index | get_column_index (const Index &) const |
Tensor< Index, 1 > | get_input_columns_indices () const |
Returns an indices vector with the positions of the inputs. More... | |
Tensor< Index, 1 > | get_input_time_series_columns_indices () const |
Tensor< Index, 1 > | get_target_columns_indices () const |
Returns an indices vector with the positions of the targets. More... | |
Tensor< Index, 1 > | get_target_time_series_columns_indices () const |
Tensor< Index, 1 > | get_unused_columns_indices () const |
Returns an indices vector with the positions of the unused columns. More... | |
Tensor< Index, 1 > | get_used_columns_indices () const |
Returns an indices vector with the positions of the used columns. More... | |
Tensor< string, 1 > | get_columns_names () const |
Returns a string vector that contains the names of the columns. More... | |
Tensor< string, 1 > | get_input_columns_names () const |
Returns a string vector that contains the names of the columns whose uses are Input. More... | |
Tensor< string, 1 > | get_target_columns_names () const |
Returns a string vector which contains the names of the columns whose uses are Target. More... | |
Tensor< string, 1 > | get_used_columns_names () const |
Returns a string vector which contains the names of the columns used whether Input, Target or Time. More... | |
ColumnType | get_column_type (const Index &index) const |
VariableUse | get_column_use (const Index &) const |
Returns a vector containing the use of the column, without taking into account the categories. More... | |
Tensor< VariableUse, 1 > | get_columns_uses () const |
Returns the uses of each column of the data set. More... | |
Index | get_variables_number () const |
Returns the number of variables in the data set. More... | |
Index | get_time_series_variables_number () const |
Returns the number of variables in the time series data. More... | |
Index | get_input_variables_number () const |
Index | get_target_variables_number () const |
Returns the number of target variables of the data set. More... | |
Index | get_unused_variables_number () const |
Returns the number of variables which will neither be used as input nor as target. More... | |
Index | get_used_variables_number () const |
Returns the number of variables which are either input or target. More... | |
string | get_variable_name (const Index &) const |
Tensor< string, 1 > | get_variables_names () const |
Tensor< string, 1 > | get_time_series_variables_names () const |
Tensor< string, 1 > | get_input_variables_names () const |
Tensor< string, 1 > | get_target_variables_names () const |
Index | get_variable_index (const string &name) const |
Tensor< Index, 1 > | get_variable_indices (const Index &) const |
Tensor< Index, 1 > | get_unused_variables_indices () const |
Returns the indices of the unused variables. More... | |
Tensor< Index, 1 > | get_used_variables_indices () const |
Returns the indices of the used variables. More... | |
Tensor< Index, 1 > | get_input_variables_indices () const |
Returns the indices of the input variables. More... | |
Tensor< Index, 1 > | get_target_variables_indices () const |
Returns the indices of the target variables. More... | |
VariableUse | get_variable_use (const Index &) const |
Tensor< VariableUse, 1 > | get_variables_uses () const |
const Tensor< Index, 1 > & | get_input_variables_dimensions () const |
Returns the dimensions of the input variables. More... | |
Index | get_input_variables_rank () const |
Tensor< Scaler, 1 > | get_columns_scalers () const |
Tensor< Scaler, 1 > | get_input_variables_scalers () const |
Tensor< Scaler, 1 > | get_target_variables_scalers () const |
Tensor< Index, 2 > | get_batches (const Tensor< Index, 1 > &, const Index &, const bool &, const Index &buffer_size=100) const |
const Tensor< type, 2 > & | get_data () const |
Tensor< type, 2 > * | get_data_pointer () |
const Tensor< type, 2 > & | get_time_series_data () const |
Tensor< type, 2 > | get_training_data () const |
Tensor< type, 2 > | get_selection_data () const |
Tensor< type, 2 > | get_testing_data () const |
Tensor< string, 1 > | get_time_series_columns_names () const |
Index | get_time_series_columns_number () const |
Returns the number of columns in the time series. More... | |
Tensor< type, 2 > | get_input_data () const |
Tensor< type, 2 > | get_target_data () const |
Tensor< type, 2 > | get_input_data (const Tensor< Index, 1 > &) const |
Tensor< type, 2 > | get_target_data (const Tensor< Index, 1 > &) const |
Tensor< type, 2 > | get_training_input_data () const |
Tensor< type, 2 > | get_training_target_data () const |
Tensor< type, 2 > | get_selection_input_data () const |
Tensor< type, 2 > | get_selection_target_data () const |
Tensor< type, 2 > | get_testing_input_data () const |
Tensor< type, 2 > | get_testing_target_data () const |
Tensor< type, 1 > | get_sample_data (const Index &) const |
Tensor< type, 1 > | get_sample_data (const Index &, const Tensor< Index, 1 > &) const |
Tensor< type, 2 > | get_sample_input_data (const Index &) const |
Tensor< type, 2 > | get_sample_target_data (const Index &) const |
Tensor< type, 2 > | get_column_data (const Index &) const |
Tensor< type, 2 > | get_column_data (const Index &, const Tensor< Index, 1 > &) const |
Tensor< type, 2 > | get_column_data (const Tensor< Index, 1 > &) const |
Tensor< type, 2 > | get_column_data (const string &) const |
Tensor< type, 1 > | get_variable_data (const Index &) const |
Tensor< type, 1 > | get_variable_data (const string &) const |
Tensor< type, 1 > | get_variable_data (const Index &, const Tensor< Index, 1 > &) const |
Tensor< type, 1 > | get_variable_data (const string &, const Tensor< Index, 1 > &) const |
Tensor< Tensor< string, 1 >, 1 > | get_data_file_preview () const |
Tensor< type, 2 > | get_subtensor_data (const Tensor< Index, 1 > &, const Tensor< Index, 1 > &) const |
MissingValuesMethod | get_missing_values_method () const |
Returns a string with the method used. More... | |
const string & | get_data_file_name () const |
Returns the name of the data file. More... | |
const bool & | get_header_line () const |
Returns true if the first line of the data file has a header with the names of the variables, and false otherwise. More... | |
const bool & | get_rows_label () const |
Returns true if the data file has rows label, and false otherwise. More... | |
Tensor< string, 1 > | get_rows_label_tensor () const |
Tensor< string, 1 > | get_selection_rows_label_tensor () |
Tensor< string, 1 > | get_testing_rows_label_tensor () |
const Separator & | get_separator () const |
Returns the separator to be used in the data file. More... | |
char | get_separator_char () const |
Returns the character which will be used as separator in the data file. More... | |
string | get_separator_string () const |
Returns the string which will be used as separator in the data file. More... | |
const string & | get_missing_values_label () const |
Returns the string which will be used as label for the missing values in the data file. More... | |
const Index & | get_lags_number () const |
Returns the number of lags to be used in a time series prediction application. More... | |
const Index & | get_steps_ahead () const |
Returns the number of steps ahead to be used in a time series prediction application. More... | |
const string & | get_time_column () const |
Returns the indices of the time variables in the data set. More... | |
Index | get_time_series_time_column_index () const |
Index | get_gmt () const |
const bool & | get_display () const |
void | set () |
Sets zero samples and zero variables in the data set. More... | |
void | set (const Tensor< type, 2 > &) |
void | set (const Index &, const Index &) |
void | set (const Index &, const Index &, const Index &) |
void | set (const DataSet &) |
void | set (const tinyxml2::XMLDocument &) |
void | set (const string &) |
void | set (const string &, const char &, const bool &) |
void | set_default () |
void | set_threads_number (const int &) |
void | set_samples_number (const Index &) |
void | set_training () |
Sets all the samples in the data set for training. More... | |
void | set_selection () |
Sets all the samples in the data set for selection. More... | |
void | set_testing () |
Sets all the samples in the data set for testing. More... | |
void | set_training (const Tensor< Index, 1 > &) |
void | set_selection (const Tensor< Index, 1 > &) |
void | set_testing (const Tensor< Index, 1 > &) |
void | set_samples_unused () |
Sets all the samples in the data set for unused. More... | |
void | set_samples_unused (const Tensor< Index, 1 > &) |
void | set_sample_use (const Index &, const SampleUse &) |
void | set_sample_use (const Index &, const string &) |
void | set_samples_uses (const Tensor< SampleUse, 1 > &) |
void | set_samples_uses (const Tensor< string, 1 > &) |
void | set_columns (const Tensor< Column, 1 > &) |
void | set_default_columns_uses () |
void | set_default_columns_names () |
void | set_column_name (const Index &, const string &) |
void | set_columns_uses (const Tensor< string, 1 > &) |
void | set_columns_uses (const Tensor< VariableUse, 1 > &) |
void | set_columns_unused () |
Sets all columns in the data_set as unused columns. More... | |
void | set_input_target_columns (const Tensor< Index, 1 > &, const Tensor< Index, 1 > &) |
void | set_input_columns_unused () |
Sets all input columns in the data_set as unused columns. More... | |
void | set_input_columns (const Tensor< Index, 1 > &, const Tensor< bool, 1 > &) |
void | set_column_use (const Index &, const VariableUse &) |
void | set_column_use (const string &, const VariableUse &) |
void | set_column_type (const Index &, const ColumnType &) |
void | set_column_type (const string &, const ColumnType &) |
void | set_columns_names (const Tensor< string, 1 > &) |
void | set_columns_number (const Index &) |
void | set_columns_scalers (const Scaler &) |
void | set_binary_simple_columns () |
void | check_constant_columns () |
Tensor< type, 2 > | transform_binary_column (const Tensor< type, 1 > &) const |
void | set_variables_names (const Tensor< string, 1 > &) |
void | set_variable_name (const Index &, const string &) |
void | set_input () |
Sets all the variables in the data set as input variables. More... | |
void | set_target () |
Sets all the variables in the data set as target variables. More... | |
void | set_variables_unused () |
Sets all the variables in the data set as unused variables. More... | |
void | set_input_variables_dimensions (const Tensor< Index, 1 > &) |
Sets new input dimensions in the data set. More... | |
void | set_data (const Tensor< type, 2 > &) |
void | set_data_file_name (const string &) |
void | set_has_columns_names (const bool &) |
Sets if the data file contains a header with the names of the columns. More... | |
void | set_has_rows_label (const bool &) |
Sets if the data file contains rows label. More... | |
void | set_separator (const Separator &) |
void | set_separator (const string &) |
void | set_separator (const char &) |
void | set_missing_values_label (const string &) |
void | set_missing_values_method (const MissingValuesMethod &) |
void | set_missing_values_method (const string &) |
void | set_lags_number (const Index &) |
void | set_steps_ahead_number (const Index &) |
void | set_time_column (const string &) |
void | set_gmt (Index &) |
void | set_display (const bool &) |
bool | is_empty () const |
Returns true if the data matrix is empty, and false otherwise. More... | |
bool | is_sample_used (const Index &) const |
bool | is_sample_unused (const Index &) const |
bool | has_binary_columns () const |
bool | has_categorical_columns () const |
bool | has_time_columns () const |
bool | has_time_time_series_columns () const |
bool | has_selection () const |
void | split_samples_sequential (const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2)) |
void | split_samples_random (const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2)) |
Tensor< string, 1 > | unuse_constant_columns () |
Tensor< Index, 1 > | unuse_repeated_samples () |
Tensor< string, 1 > | unuse_uncorrelated_columns (const type &=type(0.25)) |
void | set_data_constant (const type &) |
void | set_data_random () |
void | set_data_binary_random () |
Tensor< Descriptives, 1 > | calculate_variables_descriptives () const |
Tensor< Descriptives, 1 > | calculate_used_variables_descriptives () const |
Tensor< Descriptives, 1 > | calculate_columns_descriptives_positive_samples () const |
Calculate the descriptives of the samples with positive targets in binary classification problems. More... | |
Tensor< Descriptives, 1 > | calculate_columns_descriptives_negative_samples () const |
Calculate the descriptives of the samples with negative targets in binary classification problems. More... | |
Tensor< Descriptives, 1 > | calculate_columns_descriptives_categories (const Index &) const |
Tensor< Descriptives, 1 > | calculate_columns_descriptives_training_samples () const |
Tensor< Descriptives, 1 > | calculate_columns_descriptives_selection_samples () const |
Tensor< Descriptives, 1 > | calculate_input_variables_descriptives () const |
Tensor< Descriptives, 1 > | calculate_target_variables_descriptives () const |
Tensor< Descriptives, 1 > | calculate_testing_target_variables_descriptives () const |
Tensor< type, 1 > | calculate_input_variables_minimums () const |
Returns a vector containing the minimums of the input variables. More... | |
Tensor< type, 1 > | calculate_target_variables_minimums () const |
Returns a vector containing the minimums of the target variables. More... | |
Tensor< type, 1 > | calculate_input_variables_maximums () const |
Returns a vector containing the maximums of the input variables. More... | |
Tensor< type, 1 > | calculate_target_variables_maximums () const |
Returns a vector containing the maximums of the target variables. More... | |
Tensor< type, 1 > | calculate_variables_means (const Tensor< Index, 1 > &) const |
Tensor< type, 1 > | calculate_used_variables_minimums () const |
Returns a vector containing the minimums of the used variables. More... | |
Tensor< type, 1 > | calculate_used_targets_mean () const |
Tensor< type, 1 > | calculate_selection_targets_mean () const |
Returns the mean values of the target variables on the selection samples. More... | |
Index | calculate_used_negatives (const Index &) const |
Index | calculate_training_negatives (const Index &) const |
Index | calculate_selection_negatives (const Index &) const |
Index | calculate_testing_negatives (const Index &) const |
Tensor< Histogram, 1 > | calculate_columns_distribution (const Index &=10) const |
Tensor< BoxPlot, 1 > | calculate_columns_box_plots () const |
Tensor< Correlation, 2 > | calculate_input_columns_correlations () const |
void | print_inputs_correlations () const |
Print on screen the correlation between variables in the data set. More... | |
void | print_top_inputs_correlations () const |
Tensor< Correlation, 2 > | calculate_input_target_columns_correlations () const |
void | print_input_target_columns_correlations () const |
Print on screen the correlation between targets and inputs. More... | |
void | print_top_input_target_columns_correlations () const |
Tensor< Index, 1 > | filter_data (const Tensor< type, 1 > &, const Tensor< type, 1 > &) |
void | set_default_columns_scalers () |
Tensor< Descriptives, 1 > | scale_data () |
Tensor< Descriptives, 1 > | scale_input_variables () |
Tensor< Descriptives, 1 > | scale_target_variables () |
void | unscale_data (const Tensor< Descriptives, 1 > &) |
void | unscale_input_variables (const Tensor< Descriptives, 1 > &) |
void | unscale_target_variables (const Tensor< Descriptives, 1 > &) |
Tensor< Index, 1 > | calculate_target_distribution () const |
Tensor< Tensor< Index, 1 >, 1 > | calculate_Tukey_outliers (const type &=type(1.5)) const |
void | unuse_Tukey_outliers (const type &=type(1.5)) |
Tensor< Index, 1 > | calculate_local_outlier_factor_outliers (const Index &=20, const Index &=0, const type &=type(0)) const |
void | unuse_local_outlier_factor_outliers (const Index &=20, const type &=type(1.5)) |
Tensor< Index, 1 > | calculate_isolation_forest_outliers (const Index &=100, const Index &=256, const type &=type(0)) const |
void | unuse_isolation_forest_outliers (const Index &=20, const type &=type(1.5)) |
void | transform_time_series () |
Arranges an input-target DataSet from a time series matrix, according to the number of lags. More... | |
void | transform_time_series_columns () |
This method transforms the columns into time series for forecasting problems. More... | |
void | transform_time_series_data () |
void | get_time_series_columns_number (const Index &) |
void | set_time_series_data (const Tensor< type, 2 > &) |
void | set_time_series_columns_number (const Index &) |
Tensor< type, 2 > | get_time_series_column_data (const Index &) const |
Tensor< type, 2 > | calculate_autocorrelations (const Index &=10) const |
Tensor< type, 3 > | calculate_cross_correlations (const Index &=10) const |
Calculates the cross-correlation between all the variables in the data set. More... | |
void | generate_constant_data (const Index &, const Index &, const type &) |
void | generate_random_data (const Index &, const Index &) |
void | generate_sequential_data (const Index &, const Index &) |
void | generate_Rosenbrock_data (const Index &, const Index &) |
void | generate_sum_data (const Index &, const Index &) |
void | print () const |
Prints to the screen in text format the main numbers from the data set object. More... | |
void | from_XML (const tinyxml2::XMLDocument &) |
void | write_XML (tinyxml2::XMLPrinter &) const |
Serializes the data set object into an XML document of the TinyXML library without keeping the DOM tree in memory. More... | |
void | save (const string &) const |
void | load (const string &) |
void | print_columns () const |
void | print_columns_types () const |
void | print_columns_uses () const |
void | print_data () const |
Prints to the screen the values of the data matrix. More... | |
void | print_data_preview () const |
void | print_data_file_preview () const |
void | save_data () const |
Saves to the data file the values of the data matrix. More... | |
void | save_data_binary (const string &) const |
Saves to the data file the values of the data matrix in binary format. More... | |
void | save_time_series_data_binary (const string &) const |
Saves to the data file the values of the time series data matrix in binary format. More... | |
void | read_csv () |
void | load_data_binary () |
This method loads the data from a binary data file. More... | |
void | load_time_series_data_binary (const string &) |
This method loads time series data from a binary data. More... | |
void | check_input_csv (const string &, const char &) const |
This method checks if the input data file has the correct format. Returns an error message. More... | |
Tensor< type, 2 > | read_input_csv (const string &, const char &, const string &, const bool &, const bool &) const |
This method loads data from a file and returns a matrix containing the input columns. More... | |
void | fill_time_series (const Index &) |
bool | has_nan () const |
Returns true if the data contain missing values. More... | |
bool | has_nan_row (const Index &) const |
Returns true if the given row contains missing values. More... | |
void | print_missing_values_information () const |
void | impute_missing_values_unuse () |
Sets all the samples with missing values to "Unused". More... | |
void | impute_missing_values_mean () |
Substitutes all the missing values by the mean of the corresponding variable. More... | |
void | impute_missing_values_median () |
Substitutes all the missing values by the median of the corresponding variable. More... | |
void | scrub_missing_values () |
Tensor< Index, 1 > | count_nan_columns () const |
Index | count_rows_with_nan () const |
Index | count_nan () const |
void | set_missing_values_number (const Index &) |
void | set_missing_values_number () |
void | set_columns_missing_values_number (const Tensor< Index, 1 > &) |
void | set_columns_missing_values_number () |
void | set_rows_missing_values_number (const Index &) |
void | set_rows_missing_values_number () |
void | fix_repeated_names () |
Tensor< Index, 1 > | push_back (const Tensor< Index, 1 > &, const Index &) const |
Tensor< string, 1 > | push_back (const Tensor< string, 1 > &, const string &) const |
void | initialize_sequential (Tensor< Index, 1 > &, const Index &, const Index &, const Index &) const |
void | intialize_sequential (Tensor< type, 1 > &, const type &, const type &, const type &) const |
Tensor< Index, 2 > | split_samples (const Tensor< Index, 1 > &, const Index &) const |
bool | get_has_rows_labels () const |
void | shuffle () |
void | read_csv_1 () |
void | read_csv_2_simple () |
void | read_csv_3_simple () |
void | read_csv_2_complete () |
void | read_csv_3_complete () |
void | check_separators (const string &) const |
void | check_special_characters (const string &) const |
Static Public Member Functions | |
static Tensor< string, 1 > | get_default_columns_names (const Index &) |
static Scaler | get_scaling_unscaling_method (const string &) |
Private Member Functions | |
Tensor< Index, 1 > | select_outliers_via_standard_deviation (const Tensor< type, 1 > &, const type &=type(2.0), bool=true) const |
Tensor< Index, 1 > | select_outliers_via_contamination (const Tensor< type, 1 > &, const type &=type(0.05), bool=true) const |
type | calculate_euclidean_distance (const Tensor< Index, 1 > &, const Index &, const Index &) const |
Tensor< type, 2 > | calculate_distance_matrix (const Tensor< Index, 1 > &) const |
Tensor< list< Index >, 1 > | calculate_k_nearest_neighbors (const Tensor< type, 2 > &, const Index &=20) const |
Tensor< Tensor< type, 1 >, 1 > | get_kd_tree_data () const |
Tensor< Tensor< Index, 1 >, 1 > | create_bounding_limits_kd_tree (const Index &) const |
void | create_kd_tree (Tensor< Tensor< type, 1 >, 1 > &, const Tensor< Tensor< Index, 1 >, 1 > &) const |
Tensor< list< Index >, 1 > | calculate_bounding_boxes_neighbors (const Tensor< Tensor< type, 1 >, 1 > &, const Tensor< Index, 1 > &, const Index &, const Index &) const |
Tensor< list< Index >, 1 > | calculate_kd_tree_neighbors (const Index &=20, const Index &=40) const |
Tensor< type, 1 > | calculate_average_reachability (Tensor< list< Index >, 1 > &, const Index &) const |
Tensor< type, 1 > | calculate_local_outlier_factor (Tensor< list< Index >, 1 > &, const Tensor< type, 1 > &, const Index &) const |
void | calculate_min_max_indices_list (list< Index > &, const Index &, type &, type &) const |
Index | split_isolation_tree (Tensor< type, 2 > &, list< list< Index > > &, list< Index > &) const |
Tensor< type, 2 > | create_isolation_tree (const Tensor< Index, 1 > &, const Index &) const |
Tensor< Tensor< type, 2 >, 1 > | create_isolation_forest (const Index &, const Index &, const Index &) const |
type | calculate_tree_path (const Tensor< type, 2 > &, const Index &, const Index &) const |
Tensor< type, 1 > | calculate_average_forest_paths (const Tensor< Tensor< type, 2 >, 1 > &, const Index &) const |
Private Attributes | |
NonBlockingThreadPool * | non_blocking_thread_pool = nullptr |
ThreadPoolDevice * | thread_pool_device = nullptr |
Tensor< type, 2 > | data |
Tensor< SampleUse, 1 > | samples_uses |
Tensor< string, 1 > | rows_labels |
Tensor< Column, 1 > | columns |
Tensor< Index, 1 > | input_variables_dimensions |
string | data_file_name |
Data file name. More... | |
Separator | separator = Separator::Comma |
Separator character. More... | |
string | missing_values_label = "NA" |
Missing values label. More... | |
bool | has_columns_names = false |
Header which contains variables name. More... | |
bool | has_rows_labels = false |
Header which contains the rows label. More... | |
Tensor< Tensor< string, 1 >, 1 > | data_file_preview |
string | time_column |
Index where time variable is located for forecasting applications. More... | |
Index | lags_number = 0 |
Number of lags. More... | |
Index | steps_ahead = 0 |
Number of steps ahead. More... | |
Tensor< type, 2 > | time_series_data |
Tensor< Column, 1 > | time_series_columns |
Index | gmt = 0 |
MissingValuesMethod | missing_values_method = MissingValuesMethod::Unuse |
Missing values method. More... | |
Index | missing_values_number |
Missing values. More... | |
Tensor< Index, 1 > | columns_missing_values_number |
Index | rows_missing_values_number |
bool | display = true |
Display messages to screen. More... | |
const Eigen::array< IndexPair< Index >, 1 > | product_vector_vector = {IndexPair<Index>(0, 0)} |
This class represents the concept of data set for data modelling problems, such as approximation, classification or forecasting.
It basically consists of a data Matrix separated by columns. These columns can take different categories depending on the data hosted in them.
With the OpenNN DataSet class you can edit the data to prepare your model, such as eliminating missing values, calculating correlations between variables (inputs and targets), not using certain variables or samples, etc.
Definition at line 56 of file data_set.h.
|
strong |
This enumeration represents the data type of a column (numeric, binary, categorical, date-time or constant).
Definition at line 104 of file data_set.h.
|
strong |
Enumeration of available methods for missing values in the data.
Definition at line 85 of file data_set.h.
|
strong |
Enumeration of the learning tasks.
Definition at line 89 of file data_set.h.
|
strong |
This enumeration represents the possible uses of a sample (training, selection, testing or unused).
Definition at line 94 of file data_set.h.
|
strong |
Enumeration of available separators for the data file.
Definition at line 81 of file data_set.h.
|
strong |
This enumeration represents the possible uses of a variable (id, input, target, time or unused).
Definition at line 99 of file data_set.h.
|
explicit |
Default constructor. It creates a data set object with zero samples and zero inputs and target variables. It also initializes the rest of class members to their default values.
Definition at line 20 of file data_set.cpp.
|
explicit |
Data constructor. It creates a data set object from a data Eigen Matrix. It also initializes the rest of class members to their default values.
data | Data Tensor<type, 2>. |
Definition at line 32 of file data_set.cpp.
|
explicit |
Samples and variables number constructor. It creates a data set object with given samples and variables numbers. All the variables are set as inputs. It also initializes the rest of class members to their default values.
new_samples_number | Number of samples in the data set. |
new_variables_number | Number of variables. |
Definition at line 47 of file data_set.cpp.
|
explicit |
Samples number, input variables number and target variables number constructor. It creates a data set object with given samples and inputs and target variables numbers. It also initializes the rest of class members to their default values.
new_samples_number | Number of samples in the data set. |
new_inputs_number | Number of input variables. |
new_targets_number | Number of target variables. |
Definition at line 62 of file data_set.cpp.
|
explicit |
File and separator constructor. It creates a data set object by loading the object members from a data file. It also sets a separator. Please mind the file format, which is specified in the User's Guide.
data_file_name | Data file name. |
separator | Data file separator character. |
Definition at line 76 of file data_set.cpp.
|
virtual |
Destructor.
Definition at line 84 of file data_set.cpp.
Tensor< type, 2 > calculate_autocorrelations | ( | const Index & | lags_number = 10 | ) | const |
Returns a matrix with the values of autocorrelation for every variable in the data set. The number of rows is equal to the number of variables. The number of columns is the maximum lags number.
maximum_lags_number | Maximum lags number for which autocorrelation is calculated. |
Definition at line 9315 of file data_set.cpp.
|
private |
Definition at line 9269 of file data_set.cpp.
|
private |
Definition at line 8935 of file data_set.cpp.
|
private |
Definition at line 8874 of file data_set.cpp.
Tensor< BoxPlot, 1 > calculate_columns_box_plots | ( | ) | const |
Returns a vector of subvectors with the values of a box and whiskers plot. The size of the vector is equal to the number of used variables. The size of the subvectors is 5 and they consist of:
Definition at line 5320 of file data_set.cpp.
Tensor< Descriptives, 1 > calculate_columns_descriptives_categories | ( | const Index & | class_index | ) | const |
Returns a matrix with the data set descriptive statistics.
class_index | Data set index number to make the descriptive statistics. |
Definition at line 5645 of file data_set.cpp.
Tensor< Descriptives, 1 > calculate_columns_descriptives_negative_samples | ( | ) | const |
Calculate the descriptives of the samples with negative targets in binary classification problems.
Definition at line 5584 of file data_set.cpp.
Tensor< Descriptives, 1 > calculate_columns_descriptives_positive_samples | ( | ) | const |
Calculate the descriptives of the samples with positive targets in binary classification problems.
Definition at line 5525 of file data_set.cpp.
Tensor< Descriptives, 1 > calculate_columns_descriptives_selection_samples | ( | ) | const |
Returns a vector of vectors containing some basic descriptives of all variables on the selection samples. The size of this vector is two. The subvectors are:
Definition at line 5712 of file data_set.cpp.
Tensor< Descriptives, 1 > calculate_columns_descriptives_training_samples | ( | ) | const |
Returns a vector of vectors containing some basic descriptives of all variables on the training samples. The size of this vector is two. The subvectors are:
Definition at line 5693 of file data_set.cpp.
Tensor< Histogram, 1 > calculate_columns_distribution | ( | const Index & | bins_number = 10 | ) | const |
Returns the distribution of each of the columns. In the case of numeric columns, it returns a histogram, for the case of categorical columns, it returns the frequencies of each category and for the binary columns it returns the frequencies of the positives and negatives. The default number of bins is 10.
bins_number | Number of bins. |
Definition at line 5201 of file data_set.cpp.
Tensor< type, 3 > calculate_cross_correlations | ( | const Index & | lags_number = 10 | ) | const |
Calculates the cross-correlation between all the variables in the data set.
Definition at line 9424 of file data_set.cpp.
|
private |
Definition at line 8727 of file data_set.cpp.
|
private |
Definition at line 8707 of file data_set.cpp.
Tensor< Correlation, 2 > calculate_input_columns_correlations | ( | ) | const |
Calculate the correlation between each input in the data set. Returns a matrix with the correlation values between variables in the data set.
Definition at line 6019 of file data_set.cpp.
Tensor< Correlation, 2 > calculate_input_target_columns_correlations | ( | ) | const |
Calculates the correlations between all outputs and all inputs. It returns a matrix with the data stored in Correlation format, where the number of rows is the input number and number of columns is the target number. Each element contains the correlation between a single input and a single target.
Definition at line 5876 of file data_set.cpp.
Tensor< Descriptives, 1 > calculate_input_variables_descriptives | ( | ) | const |
Returns a vector of Descriptives structures with some basic statistics of the input variables on the used samples. This includes the minimum, maximum, mean and standard deviation. The size of this vector is the number of inputs.
Definition at line 5726 of file data_set.cpp.
Tensor< type, 1 > calculate_input_variables_maximums | ( | ) | const |
Returns a vector containing the maximums of the input variables.
Definition at line 5784 of file data_set.cpp.
Tensor< type, 1 > calculate_input_variables_minimums | ( | ) | const |
Returns a vector containing the minimums of the input variables.
Definition at line 5767 of file data_set.cpp.
Tensor< Index, 1 > calculate_isolation_forest_outliers | ( | const Index & | n_trees = 100 , |
const Index & | subs_set_samples = 256 , |
||
const type & | contamination = type(0) |
||
) | const |
Definition at line 9288 of file data_set.cpp.
|
private |
Definition at line 8751 of file data_set.cpp.
|
private |
Definition at line 8915 of file data_set.cpp.
|
private |
Definition at line 8971 of file data_set.cpp.
Tensor< Index, 1 > calculate_local_outlier_factor_outliers | ( | const Index & | k_neighbors = 20 , |
const Index & | min_samples_leaf = 0 , |
||
const type & | contamination = type(0) |
||
) | const |
Calculate the outliers from the data set using the LocalOutlierFactor method.
k_neighbors | Used to perform a k_nearest_algorithm to find the local density. Default is 20. |
min_samples_leaf | The minimum number of samples per leaf when building a KDTree. If 0, automatically decide between using brute force approach or KDTree. If > samples_number/2, brute force approach is performed. Default is 0. |
contamination | Percentage of outliers in the data_set to be selected. If 0.0, those patterns which deviate from the mean of LOF more than 2 times are considered outliers. Default is 0.0. |
Definition at line 9002 of file data_set.cpp.
|
private |
Definition at line 9069 of file data_set.cpp.
Index calculate_selection_negatives | ( | const Index & | target_index | ) | const |
Counts the number of negatives of the selected target in the selection data.
target_index | Index of the target to evaluate. |
Definition at line 5433 of file data_set.cpp.
Tensor< type, 1 > calculate_selection_targets_mean | ( | ) | const |
Returns the mean values of the target variables on the selection.
Definition at line 5843 of file data_set.cpp.
Tensor< Index, 1 > calculate_target_distribution | ( | ) | const |
Returns a vector containing the number of samples of each class in the data set. If the number of target variables is one then the number of classes is two. If the number of target variables is greater than one then the number of classes is equal to the number of target variables.
Definition at line 8480 of file data_set.cpp.
Tensor< Descriptives, 1 > calculate_target_variables_descriptives | ( | ) | const |
Returns a vector of vectors with some basic descriptives of the target variables on all samples. The size of this vector is four. The subvectors are:
Definition at line 5745 of file data_set.cpp.
Tensor< type, 1 > calculate_target_variables_maximums | ( | ) | const |
Returns a vector containing the maximums of the target variables.
Definition at line 5792 of file data_set.cpp.
Tensor< type, 1 > calculate_target_variables_minimums | ( | ) | const |
Returns a vector containing the minimums of the target variables.
Definition at line 5775 of file data_set.cpp.
Index calculate_testing_negatives | ( | const Index & | target_index | ) | const |
Counts the number of negatives of the selected target in the testing data.
target_index | Index of the target to evaluate. |
Definition at line 5468 of file data_set.cpp.
Tensor< Descriptives, 1 > calculate_testing_target_variables_descriptives | ( | ) | const |
Definition at line 5755 of file data_set.cpp.
Index calculate_training_negatives | ( | const Index & | target_index | ) | const |
Counts the number of negatives of the selected target in the training data.
target_index | Index of the target to evaluate. |
Definition at line 5398 of file data_set.cpp.
|
private |
Definition at line 9227 of file data_set.cpp.
Tensor< Tensor< Index, 1 >, 1 > calculate_Tukey_outliers | ( | const type & | cleaning_parameter = type(1.5) | ) | const |
Calculate the outliers from the data set using the Tukey's test.
cleaning_parameter | Parameter used to detect outliers. |
Definition at line 8540 of file data_set.cpp.
Index calculate_used_negatives | ( | const Index & | target_index | ) | const |
Counts the number of used negatives of the selected target.
target_index | Index of the target to evaluate. |
Definition at line 5363 of file data_set.cpp.
Tensor< type, 1 > calculate_used_targets_mean | ( | ) | const |
Definition at line 5831 of file data_set.cpp.
Tensor< Descriptives, 1 > calculate_used_variables_descriptives | ( | ) | const |
Returns a vector of vectors containing some basic descriptives of the used variables and samples. The size of this vector is four. The subvectors are:
Definition at line 5514 of file data_set.cpp.
Tensor< type, 1 > calculate_used_variables_minimums | ( | ) | const |
Returns a vector containing the minimums of the used variables.
Definition at line 5800 of file data_set.cpp.
Tensor< Descriptives, 1 > calculate_variables_descriptives | ( | ) | const |
Returns a vector of vectors containing some basic descriptives of all the variables in the data set. The size of this vector is four. The subvectors are:
Definition at line 5499 of file data_set.cpp.
Tensor< type, 1 > calculate_variables_means | ( | const Tensor< Index, 1 > & | variables_indices | ) | const |
Returns a vector containing the means of a set of given variables.
variables_indices | Indices of the variables. |
Definition at line 5809 of file data_set.cpp.
void check_constant_columns | ( | ) |
Definition at line 3566 of file data_set.cpp.
void check_input_csv | ( | const string & | input_data_file_name, |
const char & | separator_char | ||
) | const |
This method checks if the input data file has the correct format. Returns an error message.
Definition at line 8111 of file data_set.cpp.
void check_separators | ( | const string & | line | ) | const |
Definition at line 10571 of file data_set.cpp.
void check_special_characters | ( | const string & | line | ) | const |
Definition at line 10646 of file data_set.cpp.
Index count_nan | ( | ) | const |
Definition at line 10783 of file data_set.cpp.
Tensor< Index, 1 > count_nan_columns | ( | ) | const |
Definition at line 10729 of file data_set.cpp.
Index count_rows_with_nan | ( | ) | const |
Definition at line 10754 of file data_set.cpp.
|
private |
Definition at line 8816 of file data_set.cpp.
|
private |
Definition at line 9202 of file data_set.cpp.
|
private |
Definition at line 9140 of file data_set.cpp.
|
private |
Definition at line 8843 of file data_set.cpp.
Tensor< Index, 1 > filter_data | ( | const Tensor< type, 1 > & | minimums, |
const Tensor< type, 1 > & | maximums | ||
) |
Unuses those samples with values outside a defined range.
minimums | vector of minimum values in the range. The size must be equal to the number of variables. |
maximums | vector of maximum values in the range. The size must be equal to the number of variables. |
Definition at line 9662 of file data_set.cpp.
void fix_repeated_names | ( | ) |
Definition at line 10840 of file data_set.cpp.
void from_XML | ( | const tinyxml2::XMLDocument & | data_set_document | ) |
Definition at line 6855 of file data_set.cpp.
void generate_constant_data | ( | const Index & | samples_number, |
const Index & | variables_number, | ||
const type & | value | ||
) |
Generates an artificial data_set with a given number of samples and number of variables using constant data.
samples_number | Number of samples in the data_set. |
variables_number | Number of variables in the data_set. |
Definition at line 9558 of file data_set.cpp.
void generate_random_data | ( | const Index & | samples_number, |
const Index & | variables_number | ||
) |
Generates an artificial data_set with a given number of samples and number of variables using random data.
samples_number | Number of samples in the data_set. |
variables_number | Number of variables in the data_set. |
Definition at line 9574 of file data_set.cpp.
void generate_Rosenbrock_data | ( | const Index & | samples_number, |
const Index & | variables_number | ||
) |
Generates an artificial data_set with a given number of samples and number of variables using the Rosenbrock function.
samples_number | Number of samples in the data_set. |
variables_number | Number of variables in the data_set. |
Definition at line 9607 of file data_set.cpp.
void generate_sequential_data | ( | const Index & | samples_number, |
const Index & | variables_number | ||
) |
Generates an artificial data_set with a given number of samples and number of variables using sequential data.
samples_number | Number of samples in the data_set. |
variables_number | Number of variables in the data_set. |
Definition at line 9587 of file data_set.cpp.
void generate_sum_data | ( | const Index & | samples_number, |
const Index & | variables_number | ||
) |
Definition at line 9637 of file data_set.cpp.
Tensor< Index, 2 > get_batches | ( | const Tensor< Index, 1 > & | samples_indices, |
const Index & | batch_samples_number, | ||
const bool & | shuffle, | ||
const Index & | new_buffer_size = 100 |
||
) | const |
Returns a vector, where each element is a vector that contains the indices of the different batches of the training samples.
shuffle | Is a boolean. If shuffle is true, then the indices are shuffled into batches, and false otherwise |
Definition at line 1217 of file data_set.cpp.
Tensor< type, 2 > get_column_data | ( | const Index & | column_index | ) | const |
Returns the data from the data set column with a given index. These data can be stored in a matrix or a vector, depending on whether the column is categorical or not (respectively).
column_index | Index of the column. |
Definition at line 4289 of file data_set.cpp.
Tensor< type, 2 > get_column_data | ( | const Index & | column_index, |
const Tensor< Index, 1 > & | rows_indices | ||
) | const |
Returns the data from the data set column with a given index. These data can be stored in a matrix or a vector, depending on whether the column is categorical or not (respectively).
column_index | Index of the column. |
rows_indices | Indices of the rows. |
Definition at line 4332 of file data_set.cpp.
Tensor< type, 2 > get_column_data | ( | const string & | column_name | ) | const |
Returns the data from the data set column with a given name. These data can be stored in a matrix or a vector, depending on whether the column is categorical or not (respectively).
column_name | Name of the column. |
Definition at line 4342 of file data_set.cpp.
Index get_column_index | ( | const Index & | variable_index | ) | const |
Returns the index of the column to which a variable index belongs.
variable_index | Index of the variable to be found. |
Definition at line 4214 of file data_set.cpp.
Index get_column_index | ( | const string & | column_name | ) | const |
Returns the index of the column with the given name.
column_name | Name of the column to be found. |
Definition at line 4192 of file data_set.cpp.
|
inline |
Definition at line 244 of file data_set.h.
DataSet::VariableUse get_column_use | ( | const Index & | index | ) | const |
Returns the use of the column with the given index, without taking into account the categories.
Definition at line 2000 of file data_set.cpp.
Tensor< DataSet::Column, 1 > get_columns | ( | ) | const |
Returns the columns of the data set.
Definition at line 2697 of file data_set.cpp.
Tensor< string, 1 > get_columns_names | ( | ) | const |
Returns a string vector that contains the names of the columns.
Definition at line 2473 of file data_set.cpp.
Index get_columns_number | ( | ) | const |
Returns the number of columns in the data set.
Definition at line 2800 of file data_set.cpp.
Tensor< Scaler, 1 > get_columns_scalers | ( | ) | const |
Definition at line 2408 of file data_set.cpp.
Tensor< DataSet::VariableUse, 1 > get_columns_uses | ( | ) | const |
Returns the uses of each column of the data set.
Definition at line 2008 of file data_set.cpp.
const Tensor< type, 2 > & get_data | ( | ) | const |
Returns a reference to the data matrix in the data set. The number of rows is equal to the number of samples. The number of columns is equal to the number of variables.
Definition at line 3673 of file data_set.cpp.
const string & get_data_file_name | ( | ) | const |
Returns the name of the data file.
Definition at line 3704 of file data_set.cpp.
Tensor< Tensor< string, 1 >, 1 > get_data_file_preview | ( | ) | const |
Definition at line 4550 of file data_set.cpp.
Tensor< type, 2 > * get_data_pointer | ( | ) |
Definition at line 3679 of file data_set.cpp.
|
static |
Definition at line 9873 of file data_set.cpp.
const bool & get_display | ( | ) | const |
Returns true if messages from this class can be displayed on the screen, or false if messages from this class can't be displayed on the screen.
Definition at line 94 of file data_set.cpp.
Index get_gmt | ( | ) | const |
Returns the gmt value of the data set, which is 0 by default. Its use is recommended in forecasting problems.
Definition at line 5856 of file data_set.cpp.
bool get_has_rows_labels | ( | ) | const |
Definition at line 11149 of file data_set.cpp.
const bool & get_header_line | ( | ) | const |
Returns true if the first line of the data file has a header with the names of the variables, and false otherwise.
Definition at line 3712 of file data_set.cpp.
Tensor< DataSet::Column, 1 > get_input_columns | ( | ) | const |
Returns the input columns of the data set.
Definition at line 2717 of file data_set.cpp.
Tensor< bool, 1 > get_input_columns_binary | ( | ) | const |
Returns the input columns of the data set.
Definition at line 2739 of file data_set.cpp.
Tensor< Index, 1 > get_input_columns_indices | ( | ) | const |
Returns an indices vector with the positions of the inputs.
Definition at line 2271 of file data_set.cpp.
Tensor< string, 1 > get_input_columns_names | ( | ) | const |
Returns a string vector that contains the names of the columns whose uses are Input.
Definition at line 2505 of file data_set.cpp.
Index get_input_columns_number | ( | ) | const |
Returns the number of columns whose uses are Input.
Definition at line 2575 of file data_set.cpp.
Tensor< type, 2 > get_input_data | ( | ) | const |
Returns a matrix with the input variables in the data set. The number of rows is the number of samples. The number of columns is the number of input variables.
Definition at line 3955 of file data_set.cpp.
Tensor< type, 2 > get_input_data | ( | const Tensor< Index, 1 > & | samples_indices | ) | const |
Returns a tensor with the input variables in the data set. The number of rows is the number of samples. The number of columns is the number of input variables.
Definition at line 3986 of file data_set.cpp.
Tensor< Index, 1 > get_input_time_series_columns_indices | ( | ) | const |
Definition at line 2292 of file data_set.cpp.
Index get_input_time_series_columns_number | ( | ) | const |
Definition at line 2591 of file data_set.cpp.
const Tensor< Index, 1 > & get_input_variables_dimensions | ( | ) | const |
Returns the dimensions of the input variables.
Definition at line 2245 of file data_set.cpp.
Tensor< Index, 1 > get_input_variables_indices | ( | ) | const |
Returns the indices of the input variables.
Definition at line 3047 of file data_set.cpp.
Tensor< string, 1 > get_input_variables_names | ( | ) | const |
Returns the names of the input variables in the data set. The size of the vector is the number of input variables.
Definition at line 2184 of file data_set.cpp.
Index get_input_variables_number | ( | ) | const |
Returns the number of input variables of the data set. Note that the number of variables does not have to equal the number of columns in the data set, because OpenNN recognizes the categorical columns, separating these categories into variables of the data set.
Definition at line 2859 of file data_set.cpp.
Index get_input_variables_rank | ( | ) | const |
Definition at line 2251 of file data_set.cpp.
Tensor< Scaler, 1 > get_input_variables_scalers | ( | ) | const |
Definition at line 2423 of file data_set.cpp.
|
private |
Definition at line 8792 of file data_set.cpp.
const Index & get_lags_number | ( | ) | const |
Returns the number of lags to be used in a time series prediction application.
Definition at line 3825 of file data_set.cpp.
const string & get_missing_values_label | ( | ) | const |
Returns the string which will be used as label for the missing values in the data file.
Definition at line 3817 of file data_set.cpp.
DataSet::MissingValuesMethod get_missing_values_method | ( | ) | const |
Returns the method used for the missing values in the data.
Definition at line 3696 of file data_set.cpp.
const bool & get_rows_label | ( | ) | const |
Returns true if the data file has rows label, and false otherwise.
Definition at line 3720 of file data_set.cpp.
Tensor< string, 1 > get_rows_label_tensor | ( | ) | const |
Definition at line 3726 of file data_set.cpp.
Tensor< type, 1 > get_sample_data | ( | const Index & | index | ) | const |
Returns the inputs and target values of a single sample in the data set.
index | Index of the sample. |
Definition at line 4093 of file data_set.cpp.
Tensor< type, 1 > get_sample_data | ( | const Index & | sample_index, |
const Tensor< Index, 1 > & | variables_indices | ||
) | const |
Returns the inputs and target values of a single sample in the data set.
sample_index | Index of the sample. |
variables_indices | Indices of the variables. |
Definition at line 4123 of file data_set.cpp.
Tensor< type, 2 > get_sample_input_data | ( | const Index & | sample_index | ) | const |
Returns the inputs values of a single sample in the data set.
sample_index | Index of the sample. |
Definition at line 4163 of file data_set.cpp.
string get_sample_string | ( | const Index & | sample_index, |
const string & | separator = "," |
||
) | const |
Returns a string with the values of the sample corresponding to the given index. The values will be separated by the given separator char.
sample_index | Index of the sample. |
separator | Separator. |
Definition at line 1006 of file data_set.cpp.
Tensor< type, 2 > get_sample_target_data | ( | const Index & | sample_index | ) | const |
Returns the target values of a single sample in the data set.
sample_index | Index of the sample. |
Definition at line 4181 of file data_set.cpp.
DataSet::SampleUse get_sample_use | ( | const Index & | index | ) | const |
Returns the use of a single sample.
index | Sample index. |
Definition at line 1199 of file data_set.cpp.
|
inline |
Definition at line 184 of file data_set.h.
const Tensor< DataSet::SampleUse, 1 > & get_samples_uses | ( | ) | const |
Returns the use of every sample (training, selection, testing or unused) in a vector.
Definition at line 1207 of file data_set.cpp.
Tensor< Index, 1 > get_samples_uses_numbers | ( | ) | const |
Returns a vector with the number of training, selection, testing and unused samples. The size of that vector is therefore four.
Definition at line 943 of file data_set.cpp.
Tensor< type, 1 > get_samples_uses_percentages | ( | ) | const |
Returns a vector with the uses of the samples in percentages of the data set. Uses: training, selection, testing and unused samples. Note that the vector size is four.
Definition at line 977 of file data_set.cpp.
|
static |
Returns a value of the scaling-unscaling method enumeration from a string containing the name of that method.
scaling_unscaling_method | String with the name of the scaling and unscaling method. |
Definition at line 3861 of file data_set.cpp.
Tensor< type, 2 > get_selection_data | ( | ) | const |
Returns a matrix with the selection samples in the data set. The number of rows is the number of selection samples. The number of columns is the number of variables.
Definition at line 3921 of file data_set.cpp.
Tensor< type, 2 > get_selection_input_data | ( | ) | const |
Returns a tensor with selection samples and input variables. The number of rows is the number of selection samples. The number of columns is the number of input variables.
Definition at line 4038 of file data_set.cpp.
Tensor< string, 1 > get_selection_rows_label_tensor | ( | ) |
Definition at line 3746 of file data_set.cpp.
Tensor< Index, 1 > get_selection_samples_indices | ( | ) | const |
Returns the indices of the samples which will be used for selection.
Definition at line 1098 of file data_set.cpp.
Index get_selection_samples_number | ( | ) | const |
Returns the number of samples in the data set which will be used for selection.
Definition at line 1402 of file data_set.cpp.
Tensor< type, 2 > get_selection_target_data | ( | ) | const |
Returns a tensor with selection samples and target variables. The number of rows is the number of selection samples. The number of columns is the number of target variables.
Definition at line 4052 of file data_set.cpp.
const DataSet::Separator & get_separator | ( | ) | const |
Returns the separator to be used in the data file.
Definition at line 3763 of file data_set.cpp.
char get_separator_char | ( | ) | const |
Returns the character which will be used as separator in the data file.
Definition at line 3771 of file data_set.cpp.
string get_separator_string | ( | ) | const |
Returns the string which will be used as separator in the data file.
Definition at line 3794 of file data_set.cpp.
const Index & get_steps_ahead | ( | ) | const |
Returns the number of steps ahead to be used in a time series prediction application.
Definition at line 3833 of file data_set.cpp.
Tensor< type, 2 > get_subtensor_data | ( | const Tensor< Index, 1 > & | rows_indices, |
const Tensor< Index, 1 > & | variables_indices | ||
) | const |
Definition at line 4556 of file data_set.cpp.
Tensor< DataSet::Column, 1 > get_target_columns | ( | ) | const |
Returns the target columns of the data set.
Definition at line 2759 of file data_set.cpp.
Tensor< Index, 1 > get_target_columns_indices | ( | ) | const |
Returns an indices vector with the positions of the targets.
Definition at line 2315 of file data_set.cpp.
Tensor< string, 1 > get_target_columns_names | ( | ) | const |
Returns a string vector which contains the names of the columns whose uses are Target.
Definition at line 2528 of file data_set.cpp.
Index get_target_columns_number | ( | ) | const |
Returns the number of columns whose uses are Target.
Definition at line 2609 of file data_set.cpp.
Tensor< type, 2 > get_target_data | ( | ) | const |
Returns a matrix with the target variables in the data set. The number of rows is the number of samples. The number of columns is the number of target variables.
Definition at line 3972 of file data_set.cpp.
Tensor< type, 2 > get_target_data | ( | const Tensor< Index, 1 > & | samples_indices | ) | const |
Returns a tensor with the target variables in the data set. The number of rows is the number of samples. The number of columns is the number of target variables.
Definition at line 3998 of file data_set.cpp.
Tensor< Index, 1 > get_target_time_series_columns_indices | ( | ) | const |
Definition at line 2336 of file data_set.cpp.
Index get_target_time_series_columns_number | ( | ) | const |
Definition at line 2625 of file data_set.cpp.
Tensor< Index, 1 > get_target_variables_indices | ( | ) | const |
Returns the indices of the target variables.
Definition at line 3094 of file data_set.cpp.
Tensor< string, 1 > get_target_variables_names | ( | ) | const |
Returns the names of the target variables in the data set. The size of the vector is the number of target variables.
Definition at line 2215 of file data_set.cpp.
Index get_target_variables_number | ( | ) | const |
Returns the number of target variables of the data set.
Definition at line 2884 of file data_set.cpp.
Tensor< Scaler, 1 > get_target_variables_scalers | ( | ) | const |
Definition at line 2447 of file data_set.cpp.
Tensor< type, 2 > get_testing_data | ( | ) | const |
Returns a matrix with the testing samples in the data set. The number of rows is the number of testing samples. The number of columns is the number of variables.
Definition at line 3938 of file data_set.cpp.
Tensor< type, 2 > get_testing_input_data | ( | ) | const |
Returns a tensor with testing samples and input variables. The number of rows is the number of testing samples. The number of columns is the number of input variables.
Definition at line 4066 of file data_set.cpp.
Tensor< string, 1 > get_testing_rows_label_tensor | ( | ) |
Definition at line 3731 of file data_set.cpp.
Tensor< Index, 1 > get_testing_samples_indices | ( | ) | const |
Returns the indices of the samples which will be used for testing.
Definition at line 1123 of file data_set.cpp.
Index get_testing_samples_number | ( | ) | const |
Returns the number of samples in the data set which will be used for testing.
Definition at line 1422 of file data_set.cpp.
Tensor< type, 2 > get_testing_target_data | ( | ) | const |
Returns a tensor with testing samples and target variables. The number of rows is the number of testing samples. The number of columns is the number of target variables.
Definition at line 4080 of file data_set.cpp.
const string & get_time_column | ( | ) | const |
Returns the name of the time column in the data set.
Definition at line 3841 of file data_set.cpp.
Index get_time_columns_number | ( | ) | const |
Returns the number of columns whose uses are Time.
Definition at line 2643 of file data_set.cpp.
Tensor< type, 2 > get_time_series_column_data | ( | const Index & | column_index | ) | const |
Returns the data from the time series column with a given index.
column_index | Index of the column. |
Definition at line 4309 of file data_set.cpp.
Tensor< DataSet::Column, 1 > get_time_series_columns | ( | ) | const |
Definition at line 2703 of file data_set.cpp.
Tensor< string, 1 > get_time_series_columns_names | ( | ) | const |
Definition at line 2488 of file data_set.cpp.
Index get_time_series_columns_number | ( | ) | const |
Returns the number of columns in the time series.
Definition at line 2807 of file data_set.cpp.
const Tensor< type, 2 > & get_time_series_data | ( | ) | const |
Returns a reference to the time series data matrix in the data set. Only for time series problems.
Definition at line 3688 of file data_set.cpp.
Index get_time_series_data_rows_number | ( | ) | const |
Definition at line 2709 of file data_set.cpp.
Index get_time_series_time_column_index | ( | ) | const |
Definition at line 3847 of file data_set.cpp.
Tensor< string, 1 > get_time_series_variables_names | ( | ) | const |
Returns a string vector with the names of all the variables in the time series data. The size of the vector is the number of variables.
Definition at line 2151 of file data_set.cpp.
Index get_time_series_variables_number | ( | ) | const |
Returns the number of variables in the time series data.
Definition at line 2835 of file data_set.cpp.
Tensor< type, 2 > get_training_data | ( | ) | const |
Returns a matrix with the training samples in the data set. The number of rows is the number of training samples. The number of columns is the number of variables.
Definition at line 3900 of file data_set.cpp.
Tensor< type, 2 > get_training_input_data | ( | ) | const |
Returns a matrix with training samples and input variables. The number of rows is the number of training samples. The number of columns is the number of input variables.
Definition at line 4010 of file data_set.cpp.
Tensor< Index, 1 > get_training_samples_indices | ( | ) | const |
Returns the indices of the samples which will be used for training.
Definition at line 1073 of file data_set.cpp.
Index get_training_samples_number | ( | ) | const |
Returns the number of samples in the data set which will be used for training.
Definition at line 1382 of file data_set.cpp.
Tensor< type, 2 > get_training_target_data | ( | ) | const |
Returns a tensor with training samples and target variables. The number of rows is the number of training samples. The number of columns is the number of target variables.
Definition at line 4024 of file data_set.cpp.
Tensor< Index, 1 > get_unused_columns_indices | ( | ) | const |
Returns an indices vector with the positions of the unused columns.
Definition at line 2359 of file data_set.cpp.
Index get_unused_columns_number | ( | ) | const |
Returns the number of columns that are not used.
Definition at line 2661 of file data_set.cpp.
Tensor< Index, 1 > get_unused_samples_indices | ( | ) | const |
Returns the indices of the samples set unused.
Definition at line 1173 of file data_set.cpp.
Index get_unused_samples_number | ( | ) | const |
Returns the number of samples in the data set which will neither be used for training, selection or testing.
Definition at line 1455 of file data_set.cpp.
Tensor< Index, 1 > get_unused_variables_indices | ( | ) | const |
Returns the indices of the unused variables.
Definition at line 2956 of file data_set.cpp.
Index get_unused_variables_number | ( | ) | const |
Returns the number of variables which will neither be used as input nor as target.
Definition at line 2910 of file data_set.cpp.
Tensor< DataSet::Column, 1 > get_used_columns | ( | ) | const |
Returns the used columns of the data set.
Definition at line 2781 of file data_set.cpp.
Tensor< Index, 1 > get_used_columns_indices | ( | ) | const |
Returns an indices vector with the positions of the used columns.
Definition at line 2383 of file data_set.cpp.
Tensor< string, 1 > get_used_columns_names | ( | ) | const |
Returns a string vector which contains the names of the columns used whether Input, Target or Time.
Definition at line 2551 of file data_set.cpp.
Index get_used_columns_number | ( | ) | const |
Returns the number of columns that are used.
Definition at line 2679 of file data_set.cpp.
Tensor< Index, 1 > get_used_samples_indices | ( | ) | const |
Returns the indices of the used samples (those which are not set unused).
Definition at line 1148 of file data_set.cpp.
Index get_used_samples_number | ( | ) | const |
Returns the total number of training, selection and testing samples, i.e. those which are not "Unused".
Definition at line 1443 of file data_set.cpp.
Tensor< Index, 1 > get_used_variables_indices | ( | ) | const |
Returns the indices of the used variables.
Definition at line 3002 of file data_set.cpp.
Index get_used_variables_number | ( | ) | const |
Returns the number of variables which are either input or target.
Definition at line 2259 of file data_set.cpp.
Tensor< type, 1 > get_variable_data | ( | const Index & | index | ) | const |
Returns all the samples of a single variable in the data set.
index | Index of the variable. |
Definition at line 4353 of file data_set.cpp.
Tensor< type, 1 > get_variable_data | ( | const Index & | variable_index, |
const Tensor< Index, 1 > & | samples_indices | ||
) | const |
Returns a given set of samples of a single variable in the data set.
variable_index | Index of the variable. |
samples_indices | Indices of the samples. |
Definition at line 4442 of file data_set.cpp.
Tensor< type, 1 > get_variable_data | ( | const string & | variable_name | ) | const |
Returns all the samples of a single variable in the data set.
variable_name | Name of the variable. |
Definition at line 4380 of file data_set.cpp.
Tensor< type, 1 > get_variable_data | ( | const string & | variable_name, |
const Tensor< Index, 1 > & | samples_indices | ||
) | const |
Returns a given set of samples of a single variable in the data set.
variable_name | Name of the variable. |
samples_indices | Indices of the samples. |
Definition at line 4481 of file data_set.cpp.
Index get_variable_index | ( | const string & | name | ) | const |
Returns a variable index in the data set with given name.
name | Name of variable. |
Definition at line 2937 of file data_set.cpp.
Tensor< Index, 1 > get_variable_indices | ( | const Index & | column_index | ) | const |
Returns the indices of a variable in the data set. Note that the number of variables does not have to equal the number of columns in the data set, because OpenNN recognizes the categorical columns, separating these categories into variables of the data set.
Definition at line 4248 of file data_set.cpp.
string get_variable_name | ( | const Index & | variable_index | ) | const |
Returns the name of a single variable in the data set.
index | Index of variable. |
Definition at line 2059 of file data_set.cpp.
DataSet::VariableUse get_variable_use | ( | const Index & | index | ) | const |
Returns the use of a single variable.
index | Index of variable. |
Definition at line 1992 of file data_set.cpp.
Tensor< string, 1 > get_variables_names | ( | ) | const |
Returns a string vector with the names of all the variables in the data set. The size of the vector is the number of variables.
Definition at line 2118 of file data_set.cpp.
Index get_variables_number | ( | ) | const |
Returns the number of variables in the data set.
Definition at line 2814 of file data_set.cpp.
Tensor< DataSet::VariableUse, 1 > get_variables_uses | ( | ) | const |
Returns a vector containing the use of each column, including the categories. The size of the vector is equal to the number of variables.
Definition at line 2026 of file data_set.cpp.
bool has_binary_columns | ( | ) | const |
Definition at line 10668 of file data_set.cpp.
bool has_categorical_columns | ( | ) | const |
Definition at line 10681 of file data_set.cpp.
bool has_nan | ( | ) | const |
Returns true if the data contain missing values.
Definition at line 5912 of file data_set.cpp.
bool has_nan_row | ( | const Index & | row_index | ) | const |
Returns true if the given row contains missing values.
Definition at line 5922 of file data_set.cpp.
bool has_selection | ( | ) | const |
Definition at line 10721 of file data_set.cpp.
bool has_time_columns | ( | ) | const |
Definition at line 10694 of file data_set.cpp.
bool has_time_time_series_columns | ( | ) | const |
Definition at line 10707 of file data_set.cpp.
void impute_missing_values_mean | ( | ) |
Substitutes all the missing values by the mean of the corresponding variable.
Definition at line 9768 of file data_set.cpp.
void impute_missing_values_median | ( | ) |
Substitutes all the missing values by the median of the corresponding variable.
Definition at line 9802 of file data_set.cpp.
void impute_missing_values_unuse | ( | ) |
Sets all the samples with missing values to "Unused".
Definition at line 9753 of file data_set.cpp.
void initialize_sequential | ( | Tensor< Index, 1 > & | new_tensor, |
const Index & | start, | ||
const Index & | step, | ||
const Index & | end | ||
) | const |
Definition at line 10947 of file data_set.cpp.
void intialize_sequential | ( | Tensor< type, 1 > & | new_tensor, |
const type & | start, | ||
const type & | step, | ||
const type & | end | ||
) | const |
Definition at line 10964 of file data_set.cpp.
bool is_empty | ( | ) | const |
Returns true if the data matrix is empty, and false otherwise.
Definition at line 3658 of file data_set.cpp.
bool is_sample_unused | ( | const Index & | index | ) | const |
Returns true if a given sample is to be unused and false in other case.
index | Sample index. |
Definition at line 926 of file data_set.cpp.
bool is_sample_used | ( | const Index & | index | ) | const |
Returns true if a given sample is to be used for training, selection or testing, and false if it is to be unused.
index | Sample index. |
Definition at line 910 of file data_set.cpp.
void load | ( | const string & | file_name | ) |
Loads the members of a data set object from a XML-type file:
Please mind about the file format. This is specified in the User's Guide.
file_name | Name of data set XML-type file. |
Definition at line 7716 of file data_set.cpp.
void load_data_binary | ( | ) |
This method loads the data from a binary data file.
Definition at line 8023 of file data_set.cpp.
void load_time_series_data_binary | ( | const string & | time_series_data_file_name | ) |
This method loads time series data from a binary data file.
Definition at line 8067 of file data_set.cpp.
void print | ( | ) | const |
Prints to the screen in text format the main numbers from the data set object.
Definition at line 7664 of file data_set.cpp.
void print_columns | ( | ) | const |
Definition at line 7735 of file data_set.cpp.
void print_columns_types | ( | ) | const |
Definition at line 7749 of file data_set.cpp.
void print_columns_uses | ( | ) | const |
Definition at line 7766 of file data_set.cpp.
void print_data | ( | ) | const |
Prints to the screen the values of the data matrix.
Definition at line 7783 of file data_set.cpp.
void print_data_file_preview | ( | ) | const |
Definition at line 6072 of file data_set.cpp.
void print_data_preview | ( | ) | const |
Prints to the screen a preview of the data matrix, i.e., the first, second and last samples.
Definition at line 7792 of file data_set.cpp.
void print_input_target_columns_correlations | ( | ) | const |
Print on screen the correlation between targets and inputs.
Definition at line 5960 of file data_set.cpp.
void print_inputs_correlations | ( | ) | const |
Print on screen the correlation between variables in the data set.
Definition at line 6064 of file data_set.cpp.
void print_missing_values_information | ( | ) | const |
Print on screen the information about the missing values in the data set.
Definition at line 5940 of file data_set.cpp.
void print_top_input_target_columns_correlations | ( | ) | const |
This method prints on screen the correlation between inputs and targets.
number | Number of variables to be printed. |
Definition at line 5983 of file data_set.cpp.
void print_top_inputs_correlations | ( | ) | const |
This method prints on screen the correlation between variables.
number | Number of variables to be printed. |
Definition at line 6091 of file data_set.cpp.
Tensor< Index, 1 > push_back | ( | const Tensor< Index, 1 > & | old_vector, |
const Index & | new_string | ||
) | const |
Definition at line 10915 of file data_set.cpp.
Tensor< string, 1 > push_back | ( | const Tensor< string, 1 > & | old_vector, |
const string & | new_string | ||
) | const |
Definition at line 10931 of file data_set.cpp.
void read_csv | ( | ) |
Definition at line 9853 of file data_set.cpp.
void read_csv_1 | ( | ) |
Definition at line 9890 of file data_set.cpp.
void read_csv_2_complete | ( | ) |
Definition at line 10253 of file data_set.cpp.
void read_csv_2_simple | ( | ) |
Definition at line 10050 of file data_set.cpp.
void read_csv_3_complete | ( | ) |
Definition at line 10395 of file data_set.cpp.
void read_csv_3_simple | ( | ) |
Definition at line 10140 of file data_set.cpp.
Tensor< type, 2 > read_input_csv | ( | const string & | input_data_file_name, |
const char & | separator_char, | ||
const string & | missing_values_label, | ||
const bool & | has_columns_name, | ||
const bool & | has_rows_label | ||
) | const |
This method loads data from a file and returns a matrix containing the input columns.
Definition at line 8182 of file data_set.cpp.
void save | ( | const string & | file_name | ) | const |
Saves the members of a data set object to a XML-type file in an XML-type format.
file_name | Name of data set XML-type file. |
Definition at line 7681 of file data_set.cpp.
void save_data | ( | ) | const |
Saves to the data file the values of the data matrix.
Definition at line 7844 of file data_set.cpp.
void save_data_binary | ( | const string & | binary_data_file_name | ) | const |
Saves to the data file the values of the data matrix in binary format.
Definition at line 7909 of file data_set.cpp.
void save_time_series_data_binary | ( | const string & | binary_data_file_name | ) | const |
Saves to the data file the values of the time series data matrix in binary format.
Definition at line 7958 of file data_set.cpp.
Tensor< Descriptives, 1 > scale_data | ( | ) |
Definition at line 6146 of file data_set.cpp.
Tensor< Descriptives, 1 > scale_input_variables | ( | ) |
It scales every input variable with the given method. The method to be used is that in the scaling and unscaling method variable.
Definition at line 6243 of file data_set.cpp.
Tensor< Descriptives, 1 > scale_target_variables | ( | ) |
Calculates the input and target variables descriptives. Then it scales the target variables with those values. The method to be used is that in the scaling and unscaling method variable. Finally, it returns the descriptives.
Definition at line 6298 of file data_set.cpp.
void scrub_missing_values | ( | ) |
General method for dealing with missing values. It switches among the different scrubbing methods available, according to the corresponding value in the missing values object.
Definition at line 9828 of file data_set.cpp.
|
private |
Definition at line 8634 of file data_set.cpp.
|
private |
Definition at line 8673 of file data_set.cpp.
void set | ( | ) |
Sets zero samples and zero variables in the data set.
Definition at line 4586 of file data_set.cpp.
void set | ( | const DataSet & | other_data_set | ) |
Sets the members of this data set object with those from another data set object.
other_data_set | Data set object to be copied. |
Definition at line 4743 of file data_set.cpp.
void set | ( | const Index & | new_samples_number, |
const Index & | new_variables_number | ||
) |
Sets new numbers of samples and variables in the inputs targets data set. All the samples are set for training. All the variables are set as inputs.
new_samples_number | Number of samples. |
new_variables_number | Number of variables. |
Definition at line 4649 of file data_set.cpp.
void set | ( | const Index & | new_samples_number, |
const Index & | new_inputs_number, | ||
const Index & | new_targets_number | ||
) |
Sets new numbers of samples and inputs and target variables in the data set. The variables in the data set are the number of inputs plus the number of targets.
new_samples_number | Number of samples. |
new_inputs_number | Number of input variables. |
new_targets_number | Number of target variables. |
Definition at line 4703 of file data_set.cpp.
void set | ( | const string & | file_name | ) |
Sets the data set members by loading them from a XML file.
file_name | Data set XML file_name. |
Definition at line 4775 of file data_set.cpp.
void set | ( | const string & | data_file_name, |
const char & | separator, | ||
const bool & | new_has_columns_names | ||
) |
Definition at line 4606 of file data_set.cpp.
void set | ( | const Tensor< type, 2 > & | new_data | ) |
Sets all variables from a data matrix.
new_data | Data matrix. |
Definition at line 4628 of file data_set.cpp.
void set | ( | const tinyxml2::XMLDocument & | data_set_document | ) |
Sets the data set members from a XML document.
data_set_document | TinyXML document containing the member data. |
Definition at line 4764 of file data_set.cpp.
void set_binary_simple_columns | ( | ) |
Definition at line 3483 of file data_set.cpp.
void set_column_name | ( | const Index & | column_index, |
const string & | new_name | ||
) |
Sets the name of a single column.
column_index | Index of column. |
new_name | New name for that column. |
Definition at line 1983 of file data_set.cpp.
void set_column_type | ( | const Index & | index, |
const ColumnType & | new_type | ||
) |
Definition at line 3277 of file data_set.cpp.
void set_column_type | ( | const string & | name, |
const ColumnType & | new_type | ||
) |
Definition at line 3283 of file data_set.cpp.
void set_column_use | ( | const Index & | index, |
const VariableUse & | new_use | ||
) |
Sets the use of a single column.
index | Index of column. |
new_use | Use for that column. |
Definition at line 3255 of file data_set.cpp.
void set_column_use | ( | const string & | name, |
const VariableUse & | new_use | ||
) |
Sets the use of a single column.
name | Name of column. |
new_use | Use for that column. |
Definition at line 3270 of file data_set.cpp.
void set_columns | ( | const Tensor< Column, 1 > & | new_columns | ) |
Definition at line 1912 of file data_set.cpp.
void set_columns_missing_values_number | ( | ) |
Definition at line 10822 of file data_set.cpp.
void set_columns_missing_values_number | ( | const Tensor< Index, 1 > & | new_columns_missing_values_number | ) |
Definition at line 10816 of file data_set.cpp.
void set_columns_names | ( | const Tensor< string, 1 > & | new_names | ) |
Sets new names for the columns in the data set from a vector of strings. The size of that vector must be equal to the total number of variables.
new_names | Name of variables. |
Definition at line 3403 of file data_set.cpp.
void set_columns_number | ( | const Index & | new_columns_number | ) |
Sets a new number of variables in the variables object. All variables are set as inputs but the last one, which is set as targets.
new_columns_number | Number of variables. |
Definition at line 3465 of file data_set.cpp.
void set_columns_scalers | ( | const Scaler & | scalers | ) |
Definition at line 3473 of file data_set.cpp.
void set_columns_unused | ( | ) |
Sets all columns in the data_set as unused columns.
Definition at line 3200 of file data_set.cpp.
void set_columns_uses | ( | const Tensor< string, 1 > & | new_columns_uses | ) |
Sets the uses of the data set columns.
new_columns_uses | String vector that contains the new uses to be set, note that this vector needs to be the size of the number of columns in the data set. |
Definition at line 3142 of file data_set.cpp.
void set_columns_uses | ( | const Tensor< VariableUse, 1 > & | new_columns_uses | ) |
Sets the uses of the data set columns.
new_columns_uses | DataSet::VariableUse vector that contains the new uses to be set, note that this vector needs to be the size of the number of columns in the data set. |
Definition at line 3173 of file data_set.cpp.
void set_data | ( | const Tensor< type, 2 > & | new_data | ) |
Sets a new data matrix. The number of rows must be equal to the number of samples. The number of columns must be equal to the number of variables. Indices of all training, selection and testing samples and inputs and target variables do not change.
new_data | Data matrix. |
Definition at line 4831 of file data_set.cpp.
void set_data_binary_random | ( | ) |
Initializes the data matrix with random values chosen from a uniform distribution with given minimum and maximum. The targets will be binary randoms.
Definition at line 6458 of file data_set.cpp.
void set_data_constant | ( | const type & | new_value | ) |
Initializes the data matrix with a given value.
new_value | Initialization value. |
Definition at line 6440 of file data_set.cpp.
void set_data_file_name | ( | const string & | new_data_file_name | ) |
Sets the name of the data file. It also loads the data from that file. Moreover, it sets the variables and samples objects.
new_data_file_name | Name of the file containing the data. |
Definition at line 4858 of file data_set.cpp.
void set_data_random | ( | ) |
Initializes the data matrix with random values chosen from a uniform distribution with given minimum and maximum.
Definition at line 6449 of file data_set.cpp.
void set_default | ( | ) |
void set_default_columns_names | ( | ) |
This method puts the names of the columns in the data_set. This is used when the data_set does not have a header, the default names are: column_0, column_1, ..., column_n.
Definition at line 1968 of file data_set.cpp.
void set_default_columns_scalers | ( | ) |
Returns a vector of strings containing the scaling method that best fits each of the input variables.
Definition at line 6128 of file data_set.cpp.
void set_default_columns_uses | ( | ) |
This method sets the n columns of the data_set by default, i.e. until column n-1 are Input and column n is Target.
Definition at line 1921 of file data_set.cpp.
void set_display | ( | const bool & | new_display | ) |
Sets a new display value. If it is set to true messages from this class are to be displayed on the screen; if it is set to false messages from this class are not to be displayed on the screen.
new_display | Display value. |
Definition at line 4785 of file data_set.cpp.
void set_gmt | ( | Index & | new_gmt | ) |
Sets the value of the gmt, by default it is 0. This is recommended to use in forecasting problems.
Definition at line 5865 of file data_set.cpp.
void set_has_columns_names | ( | const bool & | new_has_columns_names | ) |
Sets if the data file contains a header with the names of the columns.
Definition at line 4866 of file data_set.cpp.
void set_has_rows_label | ( | const bool & | new_has_rows_label | ) |
Sets if the data file contains rows label.
Definition at line 4874 of file data_set.cpp.
void set_input | ( | ) |
Sets all the variables in the data set as input variables.
Definition at line 3428 of file data_set.cpp.
void set_input_columns | ( | const Tensor< Index, 1 > & | input_columns_indices, |
const Tensor< bool, 1 > & | input_columns_use | ||
) |
Definition at line 3241 of file data_set.cpp.
void set_input_columns_unused | ( | ) |
Sets all input columns in the data_set as unused columns.
Definition at line 3229 of file data_set.cpp.
void set_input_target_columns | ( | const Tensor< Index, 1 > & | input_columns, |
const Tensor< Index, 1 > & | target_columns | ||
) |
Definition at line 3211 of file data_set.cpp.
void set_input_variables_dimensions | ( | const Tensor< Index, 1 > & | new_inputs_dimensions | ) |
Sets new input dimensions in the data set.
Definition at line 3650 of file data_set.cpp.
void set_lags_number | ( | const Index & | new_lags_number | ) |
Sets a new number of lags to be defined for a time series prediction application. When loading the data file, the time series data will be modified according to this number.
new_lags_number | Number of lags(x-1, ..., x-l) to be used. |
Definition at line 5022 of file data_set.cpp.
void set_missing_values_label | ( | const string & | new_missing_values_label | ) |
Sets a new label for the missing values.
new_missing_values_label | Label for the missing values. |
Definition at line 4961 of file data_set.cpp.
void set_missing_values_method | ( | const MissingValuesMethod & | new_missing_values_method | ) |
Sets a new method for the missing values.
new_missing_values_method | Method for the missing values. |
Definition at line 4985 of file data_set.cpp.
void set_missing_values_method | ( | const string & | new_missing_values_method | ) |
Definition at line 4991 of file data_set.cpp.
void set_missing_values_number | ( | ) |
Definition at line 10810 of file data_set.cpp.
void set_missing_values_number | ( | const Index & | new_missing_values_number | ) |
Definition at line 10804 of file data_set.cpp.
void set_rows_missing_values_number | ( | ) |
Definition at line 10834 of file data_set.cpp.
void set_rows_missing_values_number | ( | const Index & | new_rows_missing_values_number | ) |
Definition at line 10828 of file data_set.cpp.
void set_sample_use | ( | const Index & | index, |
const SampleUse & | new_use | ||
) |
Sets the use of a single sample.
index | Index of sample. |
new_use | Use for that sample. |
Definition at line 1591 of file data_set.cpp.
void set_sample_use | ( | const Index & | index, |
const string & | new_use | ||
) |
Sets the use of a single sample from a string.
index | Index of sample. |
new_use | String with the use name("Training", "Selection", "Testing" or "Unused") |
Definition at line 1602 of file data_set.cpp.
void set_samples_number | ( | const Index & | new_samples_number | ) |
Sets a new number of samples in the data set. All samples are also set for training. The indices of the inputs and target variables do not change.
new_samples_number | Number of samples. |
Definition at line 5062 of file data_set.cpp.
void set_samples_unused | ( | ) |
Sets all the samples in the data set for unused.
Definition at line 1562 of file data_set.cpp.
void set_samples_unused | ( | const Tensor< Index, 1 > & | indices | ) |
Sets samples with given indices in the data set for unused.
indices | Indices vector with the index of samples in the data set for unused. |
Definition at line 1576 of file data_set.cpp.
void set_samples_uses | ( | const Tensor< SampleUse, 1 > & | new_uses | ) |
Sets new uses to all the samples from a single vector.
new_uses | vector of use structures. The size of given vector must be equal to the number of samples. |
Definition at line 1637 of file data_set.cpp.
void set_samples_uses | ( | const Tensor< string, 1 > & | new_uses | ) |
Sets new uses to all the samples from a single vector of strings.
new_uses | vector of use strings. Possible values for the elements are "Training", "Selection", "Testing" and "Unused". The size of given vector must be equal to the number of samples. |
Definition at line 1670 of file data_set.cpp.
void set_selection | ( | ) |
Sets all the samples in the data set for selection.
Definition at line 1488 of file data_set.cpp.
void set_selection | ( | const Tensor< Index, 1 > & | indices | ) |
Sets samples with given indices in the data set for selection.
indices | Indices vector with the index of samples in the data set for selection. |
Definition at line 1531 of file data_set.cpp.
void set_separator | ( | const char & | new_separator | ) |
Sets a new separator from a char.
new_separator | Char with the separator value. |
Definition at line 4892 of file data_set.cpp.
void set_separator | ( | const Separator & | new_separator | ) |
Sets a new separator.
new_separator | Separator value. |
Definition at line 4883 of file data_set.cpp.
void set_separator | ( | const string & | new_separator_string | ) |
Sets a new separator from a string.
new_separator_string | String with the separator value. |
Definition at line 4926 of file data_set.cpp.
void set_steps_ahead_number | ( | const Index & | new_steps_ahead_number | ) |
Sets a new number of steps ahead to be defined for a time series prediction application. When loading the data file, the time series data will be modified according to this number.
new_steps_ahead_number | Number of steps ahead to be used. |
Definition at line 5032 of file data_set.cpp.
void set_target | ( | ) |
Sets all the variables in the data set as target variables.
Definition at line 3441 of file data_set.cpp.
void set_testing | ( | ) |
Sets all the samples in the data set for testing.
Definition at line 1501 of file data_set.cpp.
void set_testing | ( | const Tensor< Index, 1 > & | indices | ) |
Sets samples with given indices in the data set for testing.
indices | Indices vector with the index of samples in the data set for testing. |
Definition at line 1547 of file data_set.cpp.
void set_threads_number | ( | const int & | new_threads_number | ) |
Definition at line 5047 of file data_set.cpp.
void set_time_column | ( | const string & | new_time_column | ) |
Sets the new position where the time data is located in the data set.
new_time_index | Position where the time data is located. |
Definition at line 5041 of file data_set.cpp.
void set_time_series_columns_number | ( | const Index & | new_variables_number | ) |
Definition at line 4847 of file data_set.cpp.
void set_time_series_data | ( | const Tensor< type, 2 > & | new_data | ) |
Definition at line 4841 of file data_set.cpp.
void set_training | ( | ) |
Sets all the samples in the data set for training.
Definition at line 1475 of file data_set.cpp.
void set_training | ( | const Tensor< Index, 1 > & | indices | ) |
Sets samples with given indices in the data set for training.
indices | Indices vector with the index of samples in the data set for training. |
Definition at line 1515 of file data_set.cpp.
void set_variable_name | ( | const Index & | variable_index, |
const string & | new_variable_name | ||
) |
This method sets the name of a single variable.
index | Index of variable. |
new_name | Name of variable. |
Definition at line 3295 of file data_set.cpp.
void set_variables_names | ( | const Tensor< string, 1 > & | new_variables_names | ) |
Sets new names for the variables in the data set from a vector of strings. The size of that vector must be equal to the total number of variables.
new_names | Name of variables. |
Definition at line 3355 of file data_set.cpp.
void set_variables_unused | ( | ) |
Sets all the variables in the data set as unused variables.
Definition at line 3452 of file data_set.cpp.
void shuffle | ( | ) |
Definition at line 11113 of file data_set.cpp.
|
private |
Definition at line 9082 of file data_set.cpp.
Tensor< Index, 2 > split_samples | ( | const Tensor< Index, 1 > & | samples_indices, |
const Index & | new_batch_size | ||
) | const |
Definition at line 10981 of file data_set.cpp.
void split_samples_random | ( | const type & | training_samples_ratio = static_cast<type>(0.6) , |
const type & | selection_samples_ratio = static_cast<type>(0.2) , |
||
const type & | testing_samples_ratio = static_cast<type>(0.2) |
||
) |
Creates new training, selection and testing indices at random.
training_samples_ratio | Ratio of training samples in the data set. |
selection_samples_ratio | Ratio of selection samples in the data set. |
testing_samples_ratio | Ratio of testing samples in the data set. |
Definition at line 1726 of file data_set.cpp.
void split_samples_sequential | ( | const type & | training_samples_ratio = static_cast<type>(0.6) , |
const type & | selection_samples_ratio = static_cast<type>(0.2) , |
||
const type & | testing_samples_ratio = static_cast<type>(0.2) |
||
) |
Creates new training, selection and testing indices with sequential indices.
training_samples_ratio | Ratio of training samples in the data set. |
selection_samples_ratio | Ratio of selection samples in the data set. |
testing_samples_ratio | Ratio of testing samples in the data set. |
Definition at line 1834 of file data_set.cpp.
Tensor< type, 2 > transform_binary_column | ( | const Tensor< type, 1 > & | column | ) | const |
Definition at line 3620 of file data_set.cpp.
void transform_time_series | ( | ) |
Arranges an input-target DataSet from a time series matrix, according to the number of lags.
Definition at line 8007 of file data_set.cpp.
void transform_time_series_columns | ( | ) |
This method transforms the columns into time series for forecasting problems.
Definition at line 790 of file data_set.cpp.
void transform_time_series_data | ( | ) |
Definition at line 870 of file data_set.cpp.
void unscale_data | ( | const Tensor< Descriptives, 1 > & | variables_descriptives | ) |
Definition at line 6197 of file data_set.cpp.
void unscale_input_variables | ( | const Tensor< Descriptives, 1 > & | input_variables_descriptives | ) |
It unscales every input variable with the given method. The method to be used is that in the scaling and unscaling method variable.
Definition at line 6351 of file data_set.cpp.
void unscale_target_variables | ( | const Tensor< Descriptives, 1 > & | targets_descriptives | ) |
It unscales the target variables with those values. The method to be used is the one stored in the scaling and unscaling method variable.
Definition at line 6397 of file data_set.cpp.
Tensor< string, 1 > unuse_constant_columns | ( | ) |
Removes the input or target indices of those variables with zero standard deviation. It might change the size of the vectors containing the inputs and targets indices.
Definition at line 5073 of file data_set.cpp.
Tensor< Index, 1 > unuse_repeated_samples | ( | ) |
Removes the training, selection and testing indices of those samples which are repeated in the data matrix. It might change the size of the vectors containing the training, selection and testing indices.
Definition at line 5112 of file data_set.cpp.
void unuse_Tukey_outliers | ( | const type & | cleaning_parameter = type(1.5) | ) |
Calculates the outliers of the data set using Tukey's test and sets them in the samples object.
cleaning_parameter | Parameter used to detect outliers |
Definition at line 8624 of file data_set.cpp.
Tensor< string, 1 > unuse_uncorrelated_columns | ( | const type & | minimum_correlation = type(0.25) | ) |
Returns the unused variables without correlation.
minimum_correlation | Minimum correlation between variables. |
Definition at line 5163 of file data_set.cpp.
void write_XML | ( | tinyxml2::XMLPrinter & | file_stream | ) | const |
Serializes the data set object into an XML document of the TinyXML library without keeping the DOM tree in memory.
Definition at line 6486 of file data_set.cpp.
|
private |
Definition at line 764 of file data_set.h.
|
private |
Definition at line 826 of file data_set.h.
|
private |
Data Matrix. The number of rows is the number of samples. The number of columns is the number of variables.
Definition at line 754 of file data_set.h.
|
private |
Data file name.
Definition at line 772 of file data_set.h.
|
private |
Definition at line 790 of file data_set.h.
|
private |
Display messages to screen.
Definition at line 832 of file data_set.h.
|
private |
Definition at line 814 of file data_set.h.
|
private |
Header which contains the variable names.
Definition at line 784 of file data_set.h.
|
private |
Header which contains the rows label.
Definition at line 788 of file data_set.h.
|
private |
Definition at line 766 of file data_set.h.
|
private |
Number of lags.
Definition at line 800 of file data_set.h.
|
private |
Missing values label.
Definition at line 780 of file data_set.h.
|
private |
Missing values method.
Definition at line 820 of file data_set.h.
|
private |
Missing values.
Definition at line 824 of file data_set.h.
|
private |
Definition at line 745 of file data_set.h.
|
private |
Definition at line 834 of file data_set.h.
|
private |
Definition at line 760 of file data_set.h.
|
private |
Definition at line 828 of file data_set.h.
|
private |
Definition at line 758 of file data_set.h.
|
private |
Separator character.
Definition at line 776 of file data_set.h.
|
private |
Number of steps ahead.
Definition at line 804 of file data_set.h.
|
private |
Definition at line 746 of file data_set.h.
|
private |
Index where time variable is located for forecasting applications.
Definition at line 796 of file data_set.h.
|
private |
Definition at line 812 of file data_set.h.
|
private |
Time series data matrix. The number of rows is the number of samples before time series transformation. The number of columns is the number of variables before time series transformation.
Definition at line 810 of file data_set.h.