OpenNN  2.2
Open Neural Networks Library
Public Types | Public Member Functions | Static Public Member Functions | Private Member Functions | Private Attributes | List of all members
OpenNN::DataSet Class Reference

#include <data_set.h>

Public Types

enum  Separator { Space, Tab, Comma, Semicolon }
 
enum  ScalingUnscalingMethod { NoScaling, NoUnscaling, MinimumMaximum, MeanStandardDeviation }
 
enum  AngularUnits { Radians, Degrees }
 
enum  FileType {
  TXT, DAT, DATA, CSV,
  ODS, XLSX, ARFF
}
 
enum  ProjectType { Approximation, Classification, Forecasting, Association }
 

Public Member Functions

 DataSet (void)
 
 DataSet (const Matrix< double > &)
 
 DataSet (const size_t &, const size_t &)
 
 DataSet (const size_t &, const size_t &, const size_t &)
 
 DataSet (const tinyxml2::XMLDocument &)
 
 DataSet (const std::string &)
 
 DataSet (const DataSet &)
 
virtual ~DataSet (void)
 
DataSet & operator= (const DataSet &)
 
bool operator== (const DataSet &) const
 
FileType get_file_type (void) const
 
std::string write_file_type (void) const
 
std::string write_first_cell (void) const
 
std::string write_last_cell (void) const
 
size_t write_sheet_number (void) const
 
ProjectType get_learning_task (void) const
 
std::string write_learning_task (void) const
 
const std::string & get_data_file_name (void) const
 
const bool & get_header_line (void) const
 
const bool & get_rows_label (void) const
 
const Separator & get_separator (void) const
 
std::string get_separator_string (void) const
 
std::string write_separator (void) const
 
const std::string & get_missing_values_label (void) const
 
const size_t & get_lags_number (void) const
 
const size_t & get_steps_ahead (void) const
 
const bool & get_autoassociation (void) const
 
const Vector< size_t > & get_angular_variables (void) const
 
const AngularUnits & get_angular_units (void) const
 
const MissingValues & get_missing_values (void) const
 
MissingValues * get_missing_values_pointer (void)
 
const Variables & get_variables (void) const
 
Variables * get_variables_pointer (void)
 
const Instances & get_instances (void) const
 
Instances * get_instances_pointer (void)
 
const bool & get_display (void) const
 
bool is_binary_classification (void) const
 
bool is_multiple_classification (void) const
 
bool is_binary_variable (const size_t &) const
 
bool empty (void) const
 
const Matrix< double > & get_data (void) const
 
const Matrix< double > & get_time_series_data (void) const
 
Matrix< double > get_instances_submatrix_data (const Vector< size_t > &) const
 
Matrix< double > arrange_training_data (void) const
 
Matrix< double > arrange_selection_data (void) const
 
Matrix< double > arrange_testing_data (void) const
 
Matrix< double > arrange_input_data (void) const
 
Matrix< double > arrange_target_data (void) const
 
Matrix< double > arrange_used_input_data (void) const
 
Matrix< double > arrange_used_target_data (void) const
 
Matrix< double > arrange_training_input_data (void) const
 
Matrix< double > arrange_training_target_data (void) const
 
Matrix< double > arrange_selection_input_data (void) const
 
Matrix< double > arrange_selection_target_data (void) const
 
Matrix< double > arrange_testing_input_data (void) const
 
Matrix< double > arrange_testing_target_data (void) const
 
Vector< double > get_instance (const size_t &) const
 
Vector< double > get_instance (const size_t &, const Vector< size_t > &) const
 
Vector< double > get_variable (const size_t &) const
 
Vector< double > get_variable (const size_t &, const Vector< size_t > &) const
 
void set (void)
 
void set (const Matrix< double > &)
 
void set (const size_t &, const size_t &)
 
void set (const size_t &, const size_t &, const size_t &)
 
void set (const DataSet &)
 
void set (const tinyxml2::XMLDocument &)
 
void set (const std::string &)
 
void set_data (const Matrix< double > &)
 
void set_instances_number (const size_t &)
 
void set_variables_number (const size_t &)
 
void set_data_file_name (const std::string &)
 
void set_file_type (const FileType &)
 
void set_file_type (const std::string &)
 
void set_header_line (const bool &)
 
void set_rows_label (const bool &)
 
void set_separator (const Separator &)
 
void set_separator (const std::string &)
 
void set_missing_values_label (const std::string &)
 
void set_lags_number (const size_t &)
 
void set_steps_ahead_number (const size_t &)
 
void set_autoassociation (const bool &)
 
void set_learning_task (const ProjectType &)
 
void set_learning_task (const std::string &)
 
void set_angular_variables (const Vector< size_t > &)
 
void set_angular_units (AngularUnits &)
 
void set_display (const bool &)
 
void set_default (void)
 
void set_MPI (const DataSet *)
 
void set_instance (const size_t &, const Vector< double > &)
 
void add_instance (const Vector< double > &)
 
void subtract_instance (const size_t &)
 
void append_variable (const Vector< double > &)
 
void subtract_variable (const size_t &)
 
Vector< size_t > unuse_constant_variables (void)
 
Vector< size_t > unuse_repeated_instances (void)
 
Vector< size_t > unuse_non_significant_inputs (void)
 
void initialize_data (const double &)
 
void randomize_data_uniform (const double &minimum=-1.0, const double &maximum=1.0)
 
void randomize_data_normal (const double &mean=0.0, const double &standard_deviation=1.0)
 
Vector< Statistics< double > > calculate_data_statistics (void) const
 
Vector< Vector< double > > calculate_data_shape_parameters (void) const
 
Matrix< double > calculate_data_statistics_matrix (void) const
 
Matrix< double > calculate_positives_data_statistics_matrix (void) const
 
Matrix< double > calculate_negatives_data_statistics_matrix (void) const
 
Matrix< double > calculate_data_shape_parameters_matrix (void) const
 
Vector< Statistics< double > > calculate_training_instances_statistics (void) const
 
Vector< Statistics< double > > calculate_selection_instances_statistics (void) const
 
Vector< Statistics< double > > calculate_testing_instances_statistics (void) const
 
Vector< Vector< double > > calculate_training_instances_shape_parameters (void) const
 
Vector< Vector< double > > calculate_selection_instances_shape_parameters (void) const
 
Vector< Vector< double > > calculate_testing_instances_shape_parameters (void) const
 
Vector< Statistics< double > > calculate_inputs_statistics (void) const
 
Vector< Statistics< double > > calculate_targets_statistics (void) const
 
Vector< double > calculate_training_target_data_mean (void) const
 
Vector< double > calculate_selection_target_data_mean (void) const
 
Vector< double > calculate_testing_target_data_mean (void) const
 
Matrix< double > calculate_linear_correlations (void) const
 
Matrix< double > calculate_covariance_matrix (void) const
 
Matrix< double > perform_principal_components_analysis (const double &=0.0)
 
Matrix< double > perform_principal_components_analysis (const Matrix< double > &, const Vector< double > &, const double &=0.0)
 
void transform_principal_components_data (const Matrix< double > &)
 
void subtract_input_data_mean (void)
 
Vector< Histogram< double > > calculate_data_histograms (const size_t &=10) const
 
Vector< Histogram< double > > calculate_targets_histograms (const size_t &=10) const
 
Vector< Vector< double > > calculate_box_plots (void) const
 
size_t calculate_training_negatives (const size_t &) const
 
size_t calculate_selection_negatives (const size_t &) const
 
size_t calculate_testing_negatives (const size_t &) const
 
Vector< size_t > filter_data (const Vector< double > &, const Vector< double > &)
 
void scale_data_minimum_maximum (const Vector< Statistics< double > > &)
 
void scale_data_mean_standard_deviation (const Vector< Statistics< double > > &)
 
Vector< Statistics< double > > scale_data_minimum_maximum (void)
 
Vector< Statistics< double > > scale_data_mean_standard_deviation (void)
 
void scale_inputs_minimum_maximum (const Vector< Statistics< double > > &)
 
Vector< Statistics< double > > scale_inputs_minimum_maximum (void)
 
void scale_inputs_mean_standard_deviation (const Vector< Statistics< double > > &)
 
Vector< Statistics< double > > scale_inputs_mean_standard_deviation (void)
 
Vector< Statistics< double > > scale_inputs (const std::string &)
 
void scale_inputs (const std::string &, const Vector< Statistics< double > > &)
 
void scale_targets_minimum_maximum (const Vector< Statistics< double > > &)
 
Vector< Statistics< double > > scale_targets_minimum_maximum (void)
 
void scale_targets_mean_standard_deviation (const Vector< Statistics< double > > &)
 
Vector< Statistics< double > > scale_targets_mean_standard_deviation (void)
 
Vector< Statistics< double > > scale_targets (const std::string &)
 
void scale_targets (const std::string &, const Vector< Statistics< double > > &)
 
void unscale_data_minimum_maximum (const Vector< Statistics< double > > &)
 
void unscale_data_mean_standard_deviation (const Vector< Statistics< double > > &)
 
void unscale_inputs_minimum_maximum (const Vector< Statistics< double > > &)
 
void unscale_inputs_mean_standard_deviation (const Vector< Statistics< double > > &)
 
void unscale_targets_minimum_maximum (const Vector< Statistics< double > > &)
 
void unscale_targets_mean_standard_deviation (const Vector< Statistics< double > > &)
 
Vector< size_t > calculate_target_distribution (void) const
 
Vector< double > calculate_distances (void) const
 
Vector< size_t > balance_binary_targets_distribution (const double &=100.0)
 
Vector< size_t > balance_multiple_targets_distribution (void)
 
Vector< size_t > unuse_most_populated_target (const size_t &)
 
Vector< size_t > balance_approximation_targets_distribution (const double &=10.0)
 
Vector< size_t > arrange_binary_inputs_indices (void) const
 
Vector< size_t > arrange_real_inputs_indices (void) const
 
void sum_binary_inputs (void)
 
Matrix< double > calculate_instances_distances (const size_t &) const
 
Matrix< size_t > calculate_nearest_neighbors (const Matrix< double > &, const size_t &) const
 
Vector< double > calculate_k_distances (const Matrix< double > &, const size_t &) const
 
Matrix< double > calculate_reachability_distances (const Matrix< double > &, const Vector< double > &) const
 
Vector< double > calculate_reachability_density (const Matrix< double > &, const size_t &) const
 
Vector< double > calculate_local_outlier_factor (const size_t &=5) const
 
Vector< size_t > clean_local_outlier_factor (const size_t &=5)
 
Vector< size_t > calculate_Tukey_outliers (const size_t &, const double &=1.5) const
 
Vector< Vector< size_t > > calculate_Tukey_outliers (const double &=1.5) const
 
Matrix< double > calculate_autocorrelation (const size_t &=10) const
 
Matrix< Vector< double > > calculate_cross_correlation (void) const
 
void generate_data_approximation (const size_t &, const size_t &)
 
void generate_data_binary_classification (const size_t &, const size_t &)
 
void generate_data_multiple_classification (const size_t &, const size_t &)
 
std::string to_string (void) const
 
void print (void) const
 
void print_summary (void) const
 
tinyxml2::XMLDocument * to_XML (void) const
 
void from_XML (const tinyxml2::XMLDocument &)
 
void write_XML (tinyxml2::XMLPrinter &) const
 
void save (const std::string &) const
 
void load (const std::string &)
 
void print_data (void) const
 
void print_data_preview (void) const
 
void save_data (void) const
 
bool has_data (void) const
 
void load_data (void)
 
void load_data_binary (void)
 
void load_time_series_data_binary (void)
 
Vector< std::string > arrange_time_series_names (const Vector< std::string > &) const
 
Vector< std::string > arrange_association_names (const Vector< std::string > &) const
 
void convert_time_series (void)
 
void convert_association (void)
 
void convert_angular_variable_degrees (const size_t &)
 
void convert_angular_variable_radians (const size_t &)
 
void convert_angular_variables_degrees (const Vector< size_t > &)
 
void convert_angular_variables_radians (const Vector< size_t > &)
 
void convert_angular_variables (void)
 
void scrub_missing_values_unuse (void)
 
void scrub_missing_values_mean (void)
 
void scrub_missing_values (void)
 
size_t count_tokens (std::string &) const
 
Vector< std::string > get_tokens (const std::string &) const
 
bool is_numeric (const std::string &) const
 
void trim (std::string &) const
 
std::string get_trimmed (const std::string &) const
 
std::string prepend (const std::string &, const std::string &) const
 
bool is_numeric (const Vector< std::string > &) const
 
bool is_not_numeric (const Vector< std::string > &) const
 
bool is_mixed (const Vector< std::string > &) const
 

Static Public Member Functions

static ScalingUnscalingMethod get_scaling_unscaling_method (const std::string &)
 

Private Member Functions

size_t get_column_index (const Vector< Vector< std::string > > &, const size_t) const
 
void check_separator (const std::string &) const
 
size_t count_data_file_columns_number (void) const
 
void check_header_line (void)
 
Vector< std::string > read_header_line (void) const
 
void read_instance (const std::string &, const Vector< Vector< std::string > > &, const size_t &)
 
Vector< Vector< std::string > > set_from_data_file (void)
 
void read_from_data_file (const Vector< Vector< std::string > > &)
 

Private Attributes

FileType file_type
 
std::string first_cell
 
std::string last_cell
 
size_t sheet_number
 
std::string data_file_name
 
bool header_line
 
bool rows_label
 
Separator separator
 
std::string missing_values_label
 
size_t lags_number
 
size_t steps_ahead
 
bool association
 
ProjectType learning_task
 
Vector< size_t > angular_variables
 
AngularUnits angular_units
 
Matrix< double > data
 
Matrix< double > time_series_data
 
Variables variables
 
Instances instances
 
MissingValues missing_values
 
bool display
 

Detailed Description

This class represents the concept of a data set for data modelling problems, such as function regression, classification and time series prediction. It basically consists of a data matrix plus a variables object and an instances object.

Definition at line 53 of file data_set.h.

Constructor & Destructor Documentation

◆ DataSet() [1/7]

OpenNN::DataSet::DataSet ( void  )
explicit

Default constructor. It creates a data set object with zero instances and zero inputs and target variables. It also initializes the rest of class members to their default values.

Definition at line 27 of file data_set.cpp.

◆ DataSet() [2/7]

OpenNN::DataSet::DataSet ( const Matrix< double > &  data)
explicit

Data constructor. It creates a data set object from a data matrix. It also initializes the rest of class members to their default values.

Parameters
data  Data matrix.

Definition at line 41 of file data_set.cpp.

◆ DataSet() [3/7]

OpenNN::DataSet::DataSet ( const size_t &  new_instances_number,
const size_t &  new_variables_number 
)
explicit

Instances and variables number constructor. It creates a data set object with given instances and variables numbers. All the variables are set as inputs. It also initializes the rest of class members to their default values.

Parameters
new_instances_number  Number of instances in the data set.
new_variables_number  Number of variables.

Definition at line 58 of file data_set.cpp.

◆ DataSet() [4/7]

OpenNN::DataSet::DataSet ( const size_t &  new_instances_number,
const size_t &  new_inputs_number,
const size_t &  new_targets_number 
)
explicit

Instances number, input variables number and target variables number constructor. It creates a data set object with given instances and inputs and target variables numbers. It also initializes the rest of class members to their default values.

Parameters
new_instances_number  Number of instances in the data set.
new_inputs_number  Number of input variables.
new_targets_number  Number of target variables.

Definition at line 75 of file data_set.cpp.

◆ DataSet() [5/7]

OpenNN::DataSet::DataSet ( const tinyxml2::XMLDocument &  data_set_document)
explicit

Sets the data set members from an XML document.

Parameters
data_set_document  TinyXML document containing the member data.

Definition at line 88 of file data_set.cpp.

◆ DataSet() [6/7]

OpenNN::DataSet::DataSet ( const std::string &  file_name)
explicit

File constructor. It creates a data set object by loading the object members from an XML-type file. Please mind the file format, which is specified in the User's Guide.

Parameters
file_name  Data set file name.

Definition at line 102 of file data_set.cpp.

◆ DataSet() [7/7]

OpenNN::DataSet::DataSet ( const DataSet &  other_data_set)

Copy constructor. It creates a copy of an existing inputs-targets data set object.

Parameters
other_data_set  Data set object to be copied.

Definition at line 118 of file data_set.cpp.

Member Function Documentation

◆ add_instance()

void OpenNN::DataSet::add_instance ( const Vector< double > &  instance)

Adds a new instance to the data matrix from a vector of real numbers. The size of that vector must be equal to the number of variables. Note that resizing is necessary here and is therefore computationally expensive. All instances are also set for training.

Parameters
instance  Input and target values of the instance to be added.

Definition at line 1928 of file data_set.cpp.

◆ append_variable()

void OpenNN::DataSet::append_variable ( const Vector< double > &  variable)

Appends a variable with given values to the data matrix.

Parameters
variable  Vector of values. The size must be equal to the number of instances.

Definition at line 1998 of file data_set.cpp.

◆ arrange_association_names()

Vector< std::string > OpenNN::DataSet::arrange_association_names ( const Vector< std::string > &  ) const

Returns a vector with the names arranged for association.

Todo:

Definition at line 5468 of file data_set.cpp.

◆ arrange_input_data()

Matrix< double > OpenNN::DataSet::arrange_input_data ( void  ) const

Returns a matrix with the input variables in the data set. The number of rows is the number of instances. The number of columns is the number of input variables.

Definition at line 797 of file data_set.cpp.

◆ arrange_selection_data()

Matrix< double > OpenNN::DataSet::arrange_selection_data ( void  ) const

Returns a matrix with the selection instances in the data set. The number of rows is the number of selection instances. The number of columns is the number of variables.

Definition at line 762 of file data_set.cpp.

◆ arrange_selection_input_data()

Matrix< double > OpenNN::DataSet::arrange_selection_input_data ( void  ) const

Returns a matrix with selection instances and input variables. The number of rows is the number of selection instances. The number of columns is the number of input variables.

Definition at line 893 of file data_set.cpp.

◆ arrange_selection_target_data()

Matrix< double > OpenNN::DataSet::arrange_selection_target_data ( void  ) const

Returns a matrix with selection instances and target variables. The number of rows is the number of selection instances. The number of columns is the number of target variables.

Definition at line 909 of file data_set.cpp.

◆ arrange_target_data()

Matrix< double > OpenNN::DataSet::arrange_target_data ( void  ) const

Returns a matrix with the target variables in the data set. The number of rows is the number of instances. The number of columns is the number of target variables.

Definition at line 814 of file data_set.cpp.

◆ arrange_testing_data()

Matrix< double > OpenNN::DataSet::arrange_testing_data ( void  ) const

Returns a matrix with the testing instances in the data set. The number of rows is the number of testing instances. The number of columns is the number of variables.

Definition at line 780 of file data_set.cpp.

◆ arrange_testing_input_data()

Matrix< double > OpenNN::DataSet::arrange_testing_input_data ( void  ) const

Returns a matrix with testing instances and input variables. The number of rows is the number of testing instances. The number of columns is the number of input variables.

Definition at line 925 of file data_set.cpp.

◆ arrange_testing_target_data()

Matrix< double > OpenNN::DataSet::arrange_testing_target_data ( void  ) const

Returns a matrix with testing instances and target variables. The number of rows is the number of testing instances. The number of columns is the number of target variables.

Definition at line 941 of file data_set.cpp.

◆ arrange_time_series_names()

Vector< std::string > OpenNN::DataSet::arrange_time_series_names ( const Vector< std::string > &  ) const

Returns a vector with the names arranged for time series prediction, according to the number of lags.

Todo:

Definition at line 5437 of file data_set.cpp.

◆ arrange_training_data()

Matrix< double > OpenNN::DataSet::arrange_training_data ( void  ) const

Returns a matrix with the training instances in the data set. The number of rows is the number of training instances. The number of columns is the number of variables.

Definition at line 744 of file data_set.cpp.

◆ arrange_training_input_data()

Matrix< double > OpenNN::DataSet::arrange_training_input_data ( void  ) const

Returns a matrix with training instances and input variables. The number of rows is the number of training instances. The number of columns is the number of input variables.

Definition at line 861 of file data_set.cpp.

◆ arrange_training_target_data()

Matrix< double > OpenNN::DataSet::arrange_training_target_data ( void  ) const

Returns a matrix with training instances and target variables. The number of rows is the number of training instances. The number of columns is the number of target variables.

Definition at line 877 of file data_set.cpp.

◆ arrange_used_input_data()

Matrix< double > OpenNN::DataSet::arrange_used_input_data ( void  ) const

Returns a matrix with the input variables of the used instances in the data set. The number of rows is the number of used instances. The number of columns is the number of input variables.

Definition at line 830 of file data_set.cpp.

◆ arrange_used_target_data()

Matrix< double > OpenNN::DataSet::arrange_used_target_data ( void  ) const

Returns a matrix with the target variables of the used instances in the data set. The number of rows is the number of used instances. The number of columns is the number of target variables.

Definition at line 846 of file data_set.cpp.

◆ balance_approximation_targets_distribution()

Vector< size_t > OpenNN::DataSet::balance_approximation_targets_distribution ( const double &  percentage = 10.0)

This method balances the target distribution of a data set for a function regression problem. It returns a vector with the indices of the instances set unused. It unuses a given percentage of the instances.

Parameters
percentage  Percentage of the instances to be unused.

Definition at line 6178 of file data_set.cpp.

◆ balance_binary_targets_distribution()

Vector< size_t > OpenNN::DataSet::balance_binary_targets_distribution ( const double &  percentage = 100.0)

This method balances the targets distribution of a data set with only one target variable by unusing instances whose target variable belongs to the most populated target class. It returns a vector with the indices of the instances set unused.

Parameters
percentage  Percentage of instances to be unused.

Definition at line 5894 of file data_set.cpp.

◆ balance_multiple_targets_distribution()

Vector< size_t > OpenNN::DataSet::balance_multiple_targets_distribution ( void  )

This method balances the targets distribution of a data set with more than one target variable by unusing instances whose target variable belongs to the most populated target class. It returns a vector with the indices of the instances set unused.

Definition at line 5949 of file data_set.cpp.

◆ calculate_autocorrelation()

Matrix< double > OpenNN::DataSet::calculate_autocorrelation ( const size_t &  maximum_lags_number = 10) const

Returns a matrix with the values of autocorrelation for every variable in the data set. The number of rows is equal to the number of instances. The number of columns is the maximum lags number.

Parameters
maximum_lags_number  Maximum lags number for which autocorrelation is calculated.

Definition at line 6655 of file data_set.cpp.

◆ calculate_box_plots()

Vector< Vector< double > > OpenNN::DataSet::calculate_box_plots ( void  ) const

Returns a vector of subvectors with the values of a box and whiskers plot. The size of the vector is equal to the number of used variables. The size of the subvectors is 5 and they consist of:

  • Minimum
  • First quartile
  • Second quartile
  • Third quartile
  • Maximum

Definition at line 2321 of file data_set.cpp.

◆ calculate_covariance_matrix()

Matrix< double > OpenNN::DataSet::calculate_covariance_matrix ( void  ) const

Returns the covariance matrix for the input data set. The number of rows of the matrix is the number of inputs. The number of columns of the matrix is the number of inputs.

Definition at line 2980 of file data_set.cpp.

◆ calculate_data_histograms()

Vector< Histogram< double > > OpenNN::DataSet::calculate_data_histograms ( const size_t &  bins_number = 10) const

Returns a histogram for each variable with a given number of bins. The default number of bins is 10. The format is a vector of subvectors of subsubvectors. The size of the vector is the number of variables. The size of the subvectors is 2 (centers and frequencies). The size of the subsubvectors is the number of bins.

Parameters
bins_number  Number of bins.

Definition at line 2236 of file data_set.cpp.

◆ calculate_data_shape_parameters()

Vector< Vector< double > > OpenNN::DataSet::calculate_data_shape_parameters ( void  ) const

Returns a vector of subvectors containing the shape parameters for all the variables in the data set. The size of this vector is 2. The subvectors are:

  • Asymmetry.
  • Kurtosis.

Definition at line 2493 of file data_set.cpp.

◆ calculate_data_shape_parameters_matrix()

Matrix< double > OpenNN::DataSet::calculate_data_shape_parameters_matrix ( void  ) const

Returns all the variables shape parameters from a single matrix. The number of rows is the number of used variables. The number of columns is two (asymmetry, kurtosis).

Definition at line 2668 of file data_set.cpp.

◆ calculate_data_statistics()

Vector< Statistics< double > > OpenNN::DataSet::calculate_data_statistics ( void  ) const

Returns a vector of vectors containing some basic statistics of all the variables in the data set. The size of this vector is four. The subvectors are:

  • Minimum.
  • Maximum.
  • Mean.
  • Standard deviation.

Definition at line 2476 of file data_set.cpp.

◆ calculate_data_statistics_matrix()

Matrix< double > OpenNN::DataSet::calculate_data_statistics_matrix ( void  ) const

Returns all the variables statistics from a single matrix. The number of rows is the number of used variables. The number of columns is four (minimum, maximum, mean and standard deviation).

Definition at line 2507 of file data_set.cpp.

◆ calculate_distances()

Vector< double > OpenNN::DataSet::calculate_distances ( void  ) const

Returns a normalized distance between each instance and the mean instance. The size of this vector is the number of instances.

Definition at line 5859 of file data_set.cpp.

◆ calculate_inputs_statistics()

Vector< Statistics< double > > OpenNN::DataSet::calculate_inputs_statistics ( void  ) const

Returns a vector of vectors with some basic statistics of the input variables on all instances. The size of this vector is four. The subvectors are:

  • Input variables minimum.
  • Input variables maximum.
  • Input variables mean.
  • Input variables standard deviation.

Definition at line 2828 of file data_set.cpp.

◆ calculate_instances_distances()

Matrix< double > OpenNN::DataSet::calculate_instances_distances ( const size_t &  nearest_neighbours_number) const

Returns a matrix with the distances between every instance and the rest of the instances. The number of rows is the number of instances in the data set. The number of columns is the number of instances in the data set.

Parameters
nearest_neighbours_number  Nearest neighbors number.

Definition at line 6297 of file data_set.cpp.

◆ calculate_k_distances()

Vector< double > OpenNN::DataSet::calculate_k_distances ( const Matrix< double > &  distances,
const size_t &  nearest_neighbours_number 
) const

Returns a vector with the k-distance of every instance in the data set, which is the distance between every instance and k-th nearest neighbor.

Parameters
distances  Distances between every instance in the data set.
nearest_neighbours_number  Number of nearest neighbors to be calculated.

Definition at line 6375 of file data_set.cpp.

◆ calculate_linear_correlations()

Matrix< double > OpenNN::DataSet::calculate_linear_correlations ( void  ) const

Calculates the linear correlations between all targets and all inputs. It returns a matrix whose number of rows is the targets number and whose number of columns is the inputs number. Each element contains the linear correlation between a single target and a single input.

Definition at line 2933 of file data_set.cpp.

◆ calculate_local_outlier_factor()

Vector< double > OpenNN::DataSet::calculate_local_outlier_factor ( const size_t &  nearest_neighbours_number = 5) const

Returns a vector with the local outlier factors for every used instance.

Parameters
nearest_neighbours_number  Number of neighbors to be calculated.

Definition at line 6466 of file data_set.cpp.

◆ calculate_nearest_neighbors()

Matrix< size_t > OpenNN::DataSet::calculate_nearest_neighbors ( const Matrix< double > &  distances,
const size_t &  nearest_neighbours_number 
) const

Returns a matrix with the k-nearest neighbors to every used instance in the data set. The number of rows is the number of instances in the data set. The number of columns is the number of nearest neighbors to calculate.

Parameters
distances  Distances between every instance and the rest of them.
nearest_neighbours_number  Number of nearest neighbors to be calculated.

Definition at line 6344 of file data_set.cpp.

◆ calculate_reachability_density()

Vector< double > OpenNN::DataSet::calculate_reachability_density ( const Matrix< double > &  distances,
const size_t &  nearest_neighbours_number 
) const

Calculates reachability density for every element of the data set.

Parameters
distances  Distances between every instance in the data set.
nearest_neighbours_number  Number of nearest neighbors to be calculated.

Definition at line 6435 of file data_set.cpp.

◆ calculate_reachability_distances()

Matrix< double > OpenNN::DataSet::calculate_reachability_distances ( const Matrix< double > &  distances,
const Vector< double > &  k_distances 
) const

Calculates the reachability distances for the instances in the data set.

Parameters
distancesDistances between every instance.
k_distancesDistances of the k-th nearest neighbors.

Definition at line 6402 of file data_set.cpp.

◆ calculate_selection_instances_shape_parameters()

Vector< Vector< double > > OpenNN::DataSet::calculate_selection_instances_shape_parameters ( void  ) const

Returns a vector of vectors containing some shape parameters of all variables on the selection instances. The size of this vector is two. The subvectors are:

  • Selection data asymmetry.
  • Selection data kurtosis.

Definition at line 2787 of file data_set.cpp.

◆ calculate_selection_instances_statistics()

Vector< Statistics< double > > OpenNN::DataSet::calculate_selection_instances_statistics ( void  ) const

Returns a vector of statistics structures containing some basic statistics of all variables on the selection instances. Each structure contains:

  • Selection data minimum.
  • Selection data maximum.
  • Selection data mean.
  • Selection data standard deviation.

Definition at line 2728 of file data_set.cpp.

◆ calculate_selection_negatives()

size_t OpenNN::DataSet::calculate_selection_negatives ( const size_t &  target_index) const

Counts the number of negatives of the selected target in the selection data.

Parameters
target_indexIndex of the target to evaluate.

Definition at line 2392 of file data_set.cpp.

◆ calculate_target_distribution()

Vector< size_t > OpenNN::DataSet::calculate_target_distribution ( void  ) const
Todo:
This method is not implemented.

Returns a vector containing the number of instances of each class in the data set. If the number of target variables is one then the number of classes is two. If the number of target variables is greater than one then the number of classes is equal to the number of target variables.

Definition at line 5775 of file data_set.cpp.

◆ calculate_targets_histograms()

Vector< Histogram< double > > OpenNN::DataSet::calculate_targets_histograms ( const size_t &  bins_number = 10) const

Returns a histogram for each target variable with a given number of bins. The default number of bins is 10. The format is a vector of subvectors of subsubvectors. The size of the vector is the number of variables. The size of the subvectors is 2 (centers and frequencies). The size of the subsubvectors is the number of bins.

Parameters
bins_numberNumber of bins.

Definition at line 2282 of file data_set.cpp.

◆ calculate_targets_statistics()

Vector< Statistics< double > > OpenNN::DataSet::calculate_targets_statistics ( void  ) const

Returns a vector of statistics structures with some basic statistics of the target variables on all instances. Each structure contains:

  • Target variables minimum.
  • Target variables maximum.
  • Target variables mean.
  • Target variables standard deviation.

Definition at line 2859 of file data_set.cpp.

◆ calculate_testing_instances_shape_parameters()

Vector< Vector< double > > OpenNN::DataSet::calculate_testing_instances_shape_parameters ( void  ) const

Returns a vector of vectors containing some shape parameters of all variables on the testing instances. The size of this vector is two. The subvectors are:

  • Testing data asymmetry.
  • Testing data kurtosis.

Definition at line 2806 of file data_set.cpp.

◆ calculate_testing_instances_statistics()

Vector< Statistics< double > > OpenNN::DataSet::calculate_testing_instances_statistics ( void  ) const

Returns a vector of statistics structures containing some basic statistics of all variables on the testing instances. Each structure contains:

  • Testing data minimum.
  • Testing data maximum.
  • Testing data mean.
  • Testing data standard deviation.

Definition at line 2749 of file data_set.cpp.

◆ calculate_testing_negatives()

size_t OpenNN::DataSet::calculate_testing_negatives ( const size_t &  target_index) const

Counts the number of negatives of the selected target in the testing data.

Parameters
target_indexIndex of the target to evaluate.

Definition at line 2431 of file data_set.cpp.

◆ calculate_training_instances_shape_parameters()

Vector< Vector< double > > OpenNN::DataSet::calculate_training_instances_shape_parameters ( void  ) const

Returns a vector of vectors containing some shape parameters of all variables on the training instances. The size of this vector is two. The subvectors are:

  • Training data asymmetry.
  • Training data kurtosis.

Definition at line 2768 of file data_set.cpp.

◆ calculate_training_instances_statistics()

Vector< Statistics< double > > OpenNN::DataSet::calculate_training_instances_statistics ( void  ) const

Returns a vector of statistics structures containing some basic statistics of all variables on the training instances. Each structure contains:

  • Training data minimum.
  • Training data maximum.
  • Training data mean.
  • Training data standard deviation.

Definition at line 2707 of file data_set.cpp.

◆ calculate_training_negatives()

size_t OpenNN::DataSet::calculate_training_negatives ( const size_t &  target_index) const

Counts the number of negatives of the selected target in the training data.

Parameters
target_indexIndex of the target to evaluate.

Definition at line 2353 of file data_set.cpp.

◆ calculate_Tukey_outliers() [1/2]

Vector< size_t > OpenNN::DataSet::calculate_Tukey_outliers ( const size_t &  variable_index,
const double &  cleaning_parameter = 1.5 
) const

Calculates the outliers of the data set using Tukey's test for a single variable.

Parameters
variable_indexIndex of the variable to calculate the outliers.
cleaning_parameterParameter used to detect outliers.

Definition at line 6524 of file data_set.cpp.

◆ calculate_Tukey_outliers() [2/2]

Vector< Vector< size_t > > OpenNN::DataSet::calculate_Tukey_outliers ( const double &  cleaning_parameter = 1.5) const

Calculates the outliers of the data set using Tukey's test.

Parameters
cleaning_parameterParameter used to detect outliers.

Definition at line 6572 of file data_set.cpp.

◆ check_header_line()

void OpenNN::DataSet::check_header_line ( void  )
private

Verifies that the data file has a header line. All elements in a header line must be strings. This method can change the value of the header line member. It throws an exception if some inconsistencies are found.

Definition at line 4946 of file data_set.cpp.

◆ check_separator()

void OpenNN::DataSet::check_separator ( const std::string &  line) const
private

Verifies that a given line in the data file contains the separator character. If the line does not contain the separator, this method throws an exception.

Parameters
lineData file line.

Definition at line 4874 of file data_set.cpp.

◆ clean_local_outlier_factor()

Vector< size_t > OpenNN::DataSet::clean_local_outlier_factor ( const size_t &  nearest_neighbours_number = 5)

Removes the outliers from the data set using the local outlier factor method.

Parameters
nearest_neighbours_numberNumber of nearest neighbors to calculate.
Todo:

Definition at line 6496 of file data_set.cpp.

◆ convert_angular_variable_degrees()

void OpenNN::DataSet::convert_angular_variable_degrees ( const size_t &  variable_index)

Replaces a given angular variable expressed in degrees by the sine and cosine of that variable. This solves the discontinuity associated with angular variables. Note that this method modifies the number of variables.

Parameters
variable_indexIndex of angular variable.

Definition at line 6923 of file data_set.cpp.

◆ convert_angular_variable_radians()

void OpenNN::DataSet::convert_angular_variable_radians ( const size_t &  variable_index)

Replaces a given angular variable expressed in radians by the sine and cosine of that variable. This solves the discontinuity associated with angular variables. Note that this method modifies the number of variables.

Parameters
variable_indexIndex of angular variable.

Definition at line 6969 of file data_set.cpp.

◆ convert_angular_variables()

void OpenNN::DataSet::convert_angular_variables ( void  )

Replaces each variable in a given set of angular variables by its sine and cosine, according to the angular units used. This solves the discontinuity associated with angular variables. Note that this method modifies the number of variables.

Definition at line 7110 of file data_set.cpp.

◆ convert_angular_variables_degrees()

void OpenNN::DataSet::convert_angular_variables_degrees ( const Vector< size_t > &  indices)

Replaces each variable in a given set of angular variables expressed in degrees by its sine and cosine. This solves the discontinuity associated with angular variables. Note that this method modifies the number of variables.

Parameters
indicesIndices of angular variables.

Definition at line 7015 of file data_set.cpp.

◆ convert_angular_variables_radians()

void OpenNN::DataSet::convert_angular_variables_radians ( const Vector< size_t > &  indices)

Replaces each variable in a given set of angular variables expressed in radians by its sine and cosine. This solves the discontinuity associated with angular variables. Note that this method modifies the number of variables.

Parameters
indicesIndices of angular variables.

Definition at line 7063 of file data_set.cpp.

◆ convert_association()

void OpenNN::DataSet::convert_association ( void  )

Arranges the data set for association.

Todo:

Definition at line 5503 of file data_set.cpp.

◆ convert_time_series()

void OpenNN::DataSet::convert_time_series ( void  )

Arranges an input-target matrix from a time series matrix, according to the number of lags.

Todo:

Definition at line 5481 of file data_set.cpp.

◆ count_data_file_columns_number()

size_t OpenNN::DataSet::count_data_file_columns_number ( void  ) const
private

Returns the number of tokens in the first line of the data file. That will be interpreted as the number of columns in the data file.

Definition at line 4901 of file data_set.cpp.

◆ count_tokens()

size_t OpenNN::DataSet::count_tokens ( std::string &  str) const

Returns the number of strings delimited by separator. If separator does not match anywhere in the string, this method returns 0.

Parameters
strString to be tokenized.

Definition at line 7230 of file data_set.cpp.

◆ filter_data()

Vector< size_t > OpenNN::DataSet::filter_data ( const Vector< double > &  minimums,
const Vector< double > &  maximums 
)

Unuses those instances with values outside a defined range.

Parameters
minimumsVector of minimum values in the range. The size must be equal to the number of variables.
maximumsVector of maximum values in the range. The size must be equal to the number of variables.

Definition at line 6845 of file data_set.cpp.

◆ from_XML()

void OpenNN::DataSet::from_XML ( const tinyxml2::XMLDocument &  data_set_document)

Deserializes a TinyXML document into this data set object.

Parameters
data_set_documentXML document containing the member data.

Definition at line 4296 of file data_set.cpp.

◆ generate_data_approximation()

void OpenNN::DataSet::generate_data_approximation ( const size_t &  instances_number,
const size_t &  variables_number 
)

Generates an artificial dataset with a given number of instances and number of variables using the Rosenbrock function.

Parameters
instances_numberNumber of instances in the dataset.
variables_numberNumber of variables in the dataset.

Definition at line 6717 of file data_set.cpp.

◆ generate_data_binary_classification()

void OpenNN::DataSet::generate_data_binary_classification ( const size_t &  instances_number,
const size_t &  inputs_number 
)

Generate artificial data for a binary classification problem with a given number of instances and inputs.

Parameters
instances_numberNumber of the instances to generate.
inputs_numberNumber of the variables that the data set will have.

Definition at line 6777 of file data_set.cpp.

◆ generate_data_multiple_classification()

void OpenNN::DataSet::generate_data_multiple_classification ( const size_t &  ,
const size_t &   
)
Todo:

Definition at line 6813 of file data_set.cpp.

◆ get_angular_variables()

const Vector< size_t > & OpenNN::DataSet::get_angular_variables ( void  ) const

Returns the indices of the angular variables in the data set. When loading a data set with angular variables, a transformation of the data will be performed in order to avoid discontinuities (from 359 degrees to 1 degree).

Definition at line 686 of file data_set.cpp.

◆ get_autoassociation()

const bool & OpenNN::DataSet::get_autoassociation ( void  ) const

Returns true if the data set will be used for an association application, and false otherwise. In an association problem the target data is equal to the input data.

Definition at line 674 of file data_set.cpp.

◆ get_column_index()

size_t OpenNN::DataSet::get_column_index ( const Vector< Vector< std::string > > &  nominal_labels,
const size_t  column_index 
) const
private

Returns the index of a variable when reading the data file.

Parameters
nominal_labelsValues of all nominal variables in the data file.
column_indexIndex of column.

Definition at line 4848 of file data_set.cpp.

◆ get_data()

const Matrix< double > & OpenNN::DataSet::get_data ( void  ) const

Returns a reference to the data matrix in the data set. The number of rows is equal to the number of instances. The number of columns is equal to the number of variables.

Definition at line 334 of file data_set.cpp.

◆ get_display()

const bool & OpenNN::DataSet::get_display ( void  ) const

Returns true if messages from this class can be displayed on the screen, or false if messages from this class can't be displayed on the screen.

Definition at line 241 of file data_set.cpp.

◆ get_instance() [1/2]

Vector< double > OpenNN::DataSet::get_instance ( const size_t &  i) const

Returns the inputs and target values of a single instance in the data set.

Parameters
iIndex of the instance.

Definition at line 956 of file data_set.cpp.

◆ get_instance() [2/2]

Vector< double > OpenNN::DataSet::get_instance ( const size_t &  instance_index,
const Vector< size_t > &  variables_indices 
) const

Returns the inputs and target values of a single instance in the data set.

Parameters
instance_indexIndex of the instance.
variables_indicesIndices of the variables.

Definition at line 989 of file data_set.cpp.

◆ get_instances_submatrix_data()

Matrix< double > OpenNN::DataSet::get_instances_submatrix_data ( const Vector< size_t > &  instances_indices) const

Returns the submatrix of the data with the asked instances.

Parameters
instances_indicesIndices of the instances to return.

Definition at line 355 of file data_set.cpp.

◆ get_scaling_unscaling_method()

DataSet::ScalingUnscalingMethod OpenNN::DataSet::get_scaling_unscaling_method ( const std::string &  scaling_unscaling_method)
static

Returns a value of the scaling-unscaling method enumeration from a string containing the name of that method.

Parameters
scaling_unscaling_methodString with the name of the scaling and unscaling method.

Definition at line 707 of file data_set.cpp.

◆ get_time_series_data()

const Matrix< double > & OpenNN::DataSet::get_time_series_data ( void  ) const

Returns a reference to the time series data matrix in the data set. Only for time series problems.

Definition at line 345 of file data_set.cpp.

◆ get_tokens()

Vector< std::string > OpenNN::DataSet::get_tokens ( const std::string &  str) const

Splits the string into substrings (tokens) wherever separator occurs, and returns a vector with those strings. If separator does not match anywhere in the string, this method returns a single-element list containing this string.

Parameters
strString to be tokenized.

Definition at line 7281 of file data_set.cpp.

◆ get_trimmed()

std::string OpenNN::DataSet::get_trimmed ( const std::string &  str) const

Returns a string that has whitespace removed from the start and the end. This includes the ASCII characters "\t", "\n", "\v", "\f", "\r", and " ".

Parameters
strString to be checked.

Definition at line 7369 of file data_set.cpp.

◆ get_variable() [1/2]

Vector< double > OpenNN::DataSet::get_variable ( const size_t &  i) const

Returns all the instances of a single variable in the data set.

Parameters
iIndex of the variable.

Definition at line 1021 of file data_set.cpp.

◆ get_variable() [2/2]

Vector< double > OpenNN::DataSet::get_variable ( const size_t &  variable_index,
const Vector< size_t > &  instances_indices 
) const

Returns a given set of instances of a single variable in the data set.

Parameters
variable_indexIndex of the variable.
instances_indicesIndices of the instances.

Definition at line 1054 of file data_set.cpp.

◆ has_data()

bool OpenNN::DataSet::has_data ( void  ) const

Returns true if the data matrix is not empty (it has been loaded), and false otherwise.

Definition at line 6824 of file data_set.cpp.

◆ initialize_data()

void OpenNN::DataSet::initialize_data ( const double &  new_value)

Initializes the data matrix with a given value.

Parameters
new_valueInitialization value.

Definition at line 3905 of file data_set.cpp.

◆ is_binary_variable()

bool OpenNN::DataSet::is_binary_variable ( const size_t &  variable_index) const

Returns true if the given variable is binary and false in other case.

Parameters
variable_indexIndex of the variable that is going to be checked.

Definition at line 297 of file data_set.cpp.

◆ is_mixed()

bool OpenNN::DataSet::is_mixed ( const Vector< std::string > &  v) const

Returns true if some of the elements in a string list are numeric and some others are not numeric.

Parameters
vString list to be checked.

Definition at line 7444 of file data_set.cpp.

◆ is_not_numeric()

bool OpenNN::DataSet::is_not_numeric ( const Vector< std::string > &  v) const

Returns true if no element in a string list is numeric, and false otherwise.

Parameters
vString list to be checked.

Definition at line 7425 of file data_set.cpp.

◆ is_numeric() [1/2]

bool OpenNN::DataSet::is_numeric ( const std::string &  str) const

Returns true if the string passed as argument represents a number, and false otherwise.

Parameters
strString to be checked.

Definition at line 7326 of file data_set.cpp.

◆ is_numeric() [2/2]

bool OpenNN::DataSet::is_numeric ( const Vector< std::string > &  v) const

Returns true if all the elements in a string list are numeric, and false otherwise.

Parameters
vString list to be checked.

Definition at line 7406 of file data_set.cpp.

◆ load()

void OpenNN::DataSet::load ( const std::string &  file_name)

Loads the members of a data set object from an XML-type file:

  • Instances number.
  • Training instances number.
  • Training instances indices.
  • Selection instances number.
  • Selection instances indices.
  • Testing instances number.
  • Testing instances indices.
  • Input variables number.
  • Input variables indices.
  • Target variables number.
  • Target variables indices.
  • Input variables name.
  • Target variables name.
  • Input variables description.
  • Target variables description.
  • Display.
  • Data.

Please pay attention to the file format, which is specified in the User's Guide.

Parameters
file_nameName of data set XML-type file.

Definition at line 4718 of file data_set.cpp.

◆ operator=()

DataSet & OpenNN::DataSet::operator= ( const DataSet &  other_data_set)

Assignment operator. It assigns to the current object the members of an existing data set object.

Parameters
other_data_setData set object to be assigned.

Definition at line 141 of file data_set.cpp.

◆ operator==()

bool OpenNN::DataSet::operator== ( const DataSet &  other_data_set) const

Equal to operator. It compares this object with another object of the same class. It returns true if the members of the two objects have the same values, and false otherwise.

Parameters
other_data_setData set object to be compared with.

Definition at line 177 of file data_set.cpp.

◆ perform_principal_components_analysis() [1/2]

Matrix< double > OpenNN::DataSet::perform_principal_components_analysis ( const double &  minimum_explained_variance = 0.0)

Performs the principal components analysis of the inputs. It returns a matrix containing the principal components arranged in rows. This method deletes the unused instances of the original data set.

Parameters
minimum_explained_varianceMinimum percentage of variance used to select a principal component.

Definition at line 3019 of file data_set.cpp.

◆ perform_principal_components_analysis() [2/2]

Matrix< double > OpenNN::DataSet::perform_principal_components_analysis ( const Matrix< double > &  covariance_matrix,
const Vector< double > &  explained_variance,
const double &  minimum_explained_variance = 0.0 
)

Performs the principal components analysis of the inputs. It returns a matrix containing the principal components arranged in rows. This method deletes the unused instances of the original data set.

Parameters
covariance_matrixMatrix of covariances.
explained_varianceVector of the explained variances of the variables.
minimum_explained_varianceMinimum percentage of variance used to select a principal component.

Definition at line 3106 of file data_set.cpp.

◆ prepend()

std::string OpenNN::DataSet::prepend ( const std::string &  pre,
const std::string &  str 
) const

Prepends the string pre to the beginning of the string str and returns the whole string.

Parameters
preString to be prepended.
strOriginal string.

Definition at line 7391 of file data_set.cpp.

◆ print_data_preview()

void OpenNN::DataSet::print_data_preview ( void  ) const

Prints to the screen a preview of the data matrix, i.e., the first, second and last instances.

Definition at line 4755 of file data_set.cpp.

◆ randomize_data_normal()

void OpenNN::DataSet::randomize_data_normal ( const double &  mean = 0.0,
const double &  standard_deviation = 1.0 
)

Initializes the data matrix with random values chosen from a normal distribution with given mean and standard deviation.

Definition at line 3927 of file data_set.cpp.

◆ randomize_data_uniform()

void OpenNN::DataSet::randomize_data_uniform ( const double &  minimum = -1.0,
const double &  maximum = 1.0 
)

Initializes the data matrix with random values chosen from a uniform distribution with given minimum and maximum.

Definition at line 3916 of file data_set.cpp.

◆ read_instance()

void OpenNN::DataSet::read_instance ( const std::string &  line,
const Vector< Vector< std::string > > &  nominal_labels,
const size_t &  instance_index 
)
private

Sets the values of a single instance in the data matrix from a line in the data file.

Parameters
lineData file line.
nominal_labelsValues of all nominal variables in the data file.
instance_indexIndex of instance.

Definition at line 5068 of file data_set.cpp.

◆ save()

void OpenNN::DataSet::save ( const std::string &  file_name) const

Saves the members of a data set object to an XML-type file in an XML-type format.

Parameters
file_nameName of data set XML-type file.

Definition at line 4683 of file data_set.cpp.

◆ scale_data_mean_standard_deviation() [1/2]

void OpenNN::DataSet::scale_data_mean_standard_deviation ( const Vector< Statistics< double > > &  data_statistics)

Scales the data matrix with given mean and standard deviation values. It updates the data matrix.

Parameters
data_statisticsVector of statistics structures for all the variables in the data set. The size of that vector must be equal to the number of variables.

Definition at line 3220 of file data_set.cpp.

◆ scale_data_mean_standard_deviation() [2/2]

Vector< Statistics< double > > OpenNN::DataSet::scale_data_mean_standard_deviation ( void  )

Scales the data using the mean and standard deviation method, and the mean and standard deviation values calculated from the data matrix. It also returns the statistics from all columns.

Definition at line 3282 of file data_set.cpp.

◆ scale_data_minimum_maximum() [1/2]

void OpenNN::DataSet::scale_data_minimum_maximum ( const Vector< Statistics< double > > &  data_statistics)

Scales the data matrix with given minimum and maximum values. It updates the data matrix.

Parameters
data_statisticsVector of statistics structures for all the variables in the data set. The size of that vector must be equal to the number of variables.

Definition at line 3331 of file data_set.cpp.

◆ scale_data_minimum_maximum() [2/2]

Vector< Statistics< double > > OpenNN::DataSet::scale_data_minimum_maximum ( void  )

Scales the data using the minimum and maximum method, and the minimum and maximum values calculated from the data matrix. It also returns the statistics from all columns.

Definition at line 3266 of file data_set.cpp.

◆ scale_inputs() [1/2]

Vector< Statistics< double > > OpenNN::DataSet::scale_inputs ( const std::string &  scaling_unscaling_method)

Calculates the input and target variables statistics. Then it scales the input variables with those values. The method to be used is that in the scaling and unscaling method variable. Finally, it returns the statistics.

Definition at line 3553 of file data_set.cpp.

◆ scale_inputs() [2/2]

void OpenNN::DataSet::scale_inputs ( const std::string &  scaling_unscaling_method,
const Vector< Statistics< double > > &  inputs_statistics 
)

Scales the input variables with the given input statistics. The method to be used is that in the scaling and unscaling method variable.

Definition at line 3596 of file data_set.cpp.

◆ scale_inputs_mean_standard_deviation() [1/2]

void OpenNN::DataSet::scale_inputs_mean_standard_deviation ( const Vector< Statistics< double > > &  inputs_statistics)

Scales the input variables with given mean and standard deviation values. It updates the input variables of the data matrix.

Parameters
inputs_statisticsVector of statistics structures for the input variables. The size of that vector must be equal to the number of inputs.

Definition at line 3457 of file data_set.cpp.

◆ scale_inputs_mean_standard_deviation() [2/2]

Vector< Statistics< double > > OpenNN::DataSet::scale_inputs_mean_standard_deviation ( void  )

Scales the input variables with the calculated mean and standard deviation values from the data matrix. It updates the input variables of the data matrix. It also returns a vector of vectors with the variables statistics.

Definition at line 3471 of file data_set.cpp.

◆ scale_inputs_minimum_maximum() [1/2]

void OpenNN::DataSet::scale_inputs_minimum_maximum ( const Vector< Statistics< double > > &  inputs_statistics)

Scales the input variables with given minimum and maximum values. It updates the input variables of the data matrix.

Parameters
inputs_statisticsVector of statistics structures for all the inputs in the data set. The size of that vector must be equal to the number of input variables.

Definition at line 3505 of file data_set.cpp.

◆ scale_inputs_minimum_maximum() [2/2]

Vector< Statistics< double > > OpenNN::DataSet::scale_inputs_minimum_maximum ( void  )

Scales the input variables with the calculated minimum and maximum values from the data matrix. It updates the input variables of the data matrix. It also returns a vector of vectors with the minimum and maximum values of the input variables.

Definition at line 3519 of file data_set.cpp.

◆ scale_targets() [1/2]

Vector< Statistics< double > > OpenNN::DataSet::scale_targets ( const std::string &  scaling_unscaling_method)

Calculates the input and target variables statistics. Then it scales the target variables with those values. The method to be used is that in the scaling and unscaling method variable. Finally, it returns the statistics.

Definition at line 3736 of file data_set.cpp.

◆ scale_targets() [2/2]

void OpenNN::DataSet::scale_targets ( const std::string &  scaling_unscaling_method,
const Vector< Statistics< double > > &  targets_statistics 
)

Scales the target variables with the given target statistics. The method to be used is that in the scaling and unscaling method variable.

Definition at line 3778 of file data_set.cpp.

◆ scale_targets_mean_standard_deviation() [1/2]

void OpenNN::DataSet::scale_targets_mean_standard_deviation ( const Vector< Statistics< double > > &  targets_statistics)

Scales the target variables with given mean and standard deviation values. It updates the target variables of the data matrix.

Parameters
targets_statisticsVector of statistics structures for all the targets in the data set. The size of that vector must be equal to the number of target variables.

Definition at line 3640 of file data_set.cpp.

◆ scale_targets_mean_standard_deviation() [2/2]

Vector< Statistics< double > > OpenNN::DataSet::scale_targets_mean_standard_deviation ( void  )

Scales the target variables with the calculated mean and standard deviation values from the data matrix. It updates the target variables of the data matrix. It also returns a vector of statistics structures with the basic statistics of all the variables.

Definition at line 3654 of file data_set.cpp.

◆ scale_targets_minimum_maximum() [1/2]

void OpenNN::DataSet::scale_targets_minimum_maximum ( const Vector< Statistics< double > > &  targets_statistics)

Scales the target variables with given minimum and maximum values. It updates the target variables of the data matrix.

Parameters
targets_statisticsVector of statistics structures for all the targets in the data set. The size of that vector must be equal to the number of target variables.

Definition at line 3688 of file data_set.cpp.

◆ scale_targets_minimum_maximum() [2/2]

Vector< Statistics< double > > OpenNN::DataSet::scale_targets_minimum_maximum ( void  )

Scales the target variables with the calculated minimum and maximum values from the data matrix. It updates the target variables of the data matrix. It also returns a vector of vectors with the statistics of the target variables.

Definition at line 3719 of file data_set.cpp.

◆ scrub_missing_values()

void OpenNN::DataSet::scrub_missing_values ( void  )

General method for dealing with missing values. It switches among the different scrubbing methods available, according to the corresponding value in the missing values object.

Definition at line 7191 of file data_set.cpp.

◆ set() [1/6]

void OpenNN::DataSet::set ( const Matrix< double > &  new_data)

Sets all variables from a data matrix.

Parameters
new_dataData matrix.

Definition at line 1110 of file data_set.cpp.

◆ set() [2/6]

void OpenNN::DataSet::set ( const size_t &  new_instances_number,
const size_t &  new_variables_number 
)

Sets new numbers of instances and variables in the inputs targets data set. All the instances are set for training. All the variables are set as inputs.

Parameters
new_instances_numberNumber of instances.
new_variables_numberNumber of variables.

Definition at line 1135 of file data_set.cpp.

◆ set() [3/6]

void OpenNN::DataSet::set ( const size_t &  new_instances_number,
const size_t &  new_inputs_number,
const size_t &  new_targets_number 
)

Sets new numbers of instances and inputs and target variables in the data set. The variables in the data set are the number of inputs plus the number of targets.

Parameters
new_instances_numberNumber of instances.
new_inputs_numberNumber of input variables.
new_targets_numberNumber of target variables.

Definition at line 1187 of file data_set.cpp.

◆ set() [4/6]

void OpenNN::DataSet::set ( const DataSet &  other_data_set)

Sets the members of this data set object with those from another data set object.

Parameters
other_data_setData set object to be copied.

Definition at line 1212 of file data_set.cpp.

◆ set() [5/6]

void OpenNN::DataSet::set ( const tinyxml2::XMLDocument &  data_set_document)

Sets the data set members from an XML document.

Parameters
data_set_documentTinyXML document containing the member data.

Definition at line 1241 of file data_set.cpp.

◆ set() [6/6]

void OpenNN::DataSet::set ( const std::string &  file_name)

Sets the data set members by loading them from an XML file.

Parameters
file_name: Data set XML file name.

Definition at line 1254 of file data_set.cpp.

◆ set_angular_variables()

void OpenNN::DataSet::set_angular_variables ( const Vector< size_t > &  new_angular_variables)

Sets the indices of those variables which represent angles.

Parameters
new_angular_variables: Indices of angular variables.

Definition at line 1822 of file data_set.cpp.

◆ set_autoassociation()

void OpenNN::DataSet::set_autoassociation ( const bool &  new_autoassociation)

Sets a new autoassociation flag. If the new value is true, the data will be processed for association when loading. That is, the data file will contain the input data. The target data will be created as being equal to the input data. If the association value is set to false, the data from the file will not be processed.

Parameters
new_autoassociation: Association value.

Definition at line 1764 of file data_set.cpp.

◆ set_data()

void OpenNN::DataSet::set_data ( const Matrix< double > &  new_data)

Sets a new data matrix. The number of rows must be equal to the number of instances. The number of columns must be equal to the number of variables. Indices of all training, selection and testing instances and inputs and target variables do not change.

Parameters
new_data: Data matrix.

Definition at line 1524 of file data_set.cpp.

◆ set_data_file_name()

void OpenNN::DataSet::set_data_file_name ( const std::string &  new_data_file_name)

Sets the name of the data file. It also loads the data from that file. Moreover, it sets the variables and instances objects.

Parameters
new_data_file_name: Name of the file containing the data.

Definition at line 1577 of file data_set.cpp.

◆ set_default()

void OpenNN::DataSet::set_default ( void  )

Sets the default member values:

  • Display: True.

Definition at line 1280 of file data_set.cpp.

◆ set_display()

void OpenNN::DataSet::set_display ( const bool &  new_display)

Sets a new display value. If it is set to true messages from this class are to be displayed on the screen; if it is set to false messages from this class are not to be displayed on the screen.

Parameters
new_display: Display value.

Definition at line 1267 of file data_set.cpp.

◆ set_from_data_file()

Vector< Vector< std::string > > OpenNN::DataSet::set_from_data_file ( void  )
private

Performs a first data file read in which the format is checked, and the numbers of variables, instances and missing values are set.

Definition at line 5202 of file data_set.cpp.

◆ set_instance()

void OpenNN::DataSet::set_instance ( const size_t &  instance_index,
const Vector< double > &  instance 
)

Sets new inputs and target values of a single instance in the data set.

Parameters
instance_index: Index of the instance.
instance: New inputs and target values of the instance.

Definition at line 1879 of file data_set.cpp.

◆ set_instances_number()

void OpenNN::DataSet::set_instances_number ( const size_t &  new_instances_number)

Sets a new number of instances in the data set. All instances are also set for training. The indices of the inputs and target variables do not change.

Parameters
new_instances_number: Number of instances.

Definition at line 1846 of file data_set.cpp.

◆ set_lags_number()

void OpenNN::DataSet::set_lags_number ( const size_t &  new_lags_number)

Sets a new number of lags to be defined for a time series prediction application. When loading the data file, the time series data will be modified according to this number.

Parameters
new_lags_number: Number of lags (x-1, ..., x-l) to be used.

Definition at line 1738 of file data_set.cpp.

◆ set_learning_task() [1/2]

void OpenNN::DataSet::set_learning_task ( const ProjectType &  new_learning_task)

Sets a new project type.

Parameters
new_learning_task: New project type.

Definition at line 1775 of file data_set.cpp.

◆ set_learning_task() [2/2]

void OpenNN::DataSet::set_learning_task ( const std::string &  new_learning_task)

Sets a new project type from a string.

Parameters
new_learning_task: New project type.

Definition at line 1786 of file data_set.cpp.

◆ set_missing_values_label()

void OpenNN::DataSet::set_missing_values_label ( const std::string &  new_missing_values_label)

Sets a new label for the missing values.

Parameters
new_missing_values_label: Label for the missing values.

Definition at line 1708 of file data_set.cpp.

◆ set_MPI()

void OpenNN::DataSet::set_MPI ( const DataSet data_set)

Sends the DataSet to all the processors using MPI.

Parameters
data_set: Original DataSet object, initialized by processor 0.

Definition at line 1308 of file data_set.cpp.

◆ set_separator() [1/2]

void OpenNN::DataSet::set_separator ( const Separator &  new_separator)

Sets a new separator.

Parameters
new_separator: Separator value.

Definition at line 1661 of file data_set.cpp.

◆ set_separator() [2/2]

void OpenNN::DataSet::set_separator ( const std::string &  new_separator)

Sets a new separator from a string.

Parameters
new_separator: String with the separator value.

Definition at line 1672 of file data_set.cpp.

◆ set_steps_ahead_number()

void OpenNN::DataSet::set_steps_ahead_number ( const size_t &  new_steps_ahead_number)

Sets a new number of steps ahead to be defined for a time series prediction application. When loading the data file, the time series data will be modified according to this number.

Parameters
new_steps_ahead_number: Number of steps ahead to be used.

Definition at line 1750 of file data_set.cpp.

◆ set_variables_number()

void OpenNN::DataSet::set_variables_number ( const size_t &  new_variables_number)

Sets a new number of input variables in the data set. The indices of the training, selection and testing instances do not change. All variables are set as inputs.

Parameters
new_variables_number: Number of variables.

Definition at line 1863 of file data_set.cpp.

◆ subtract_instance()

void OpenNN::DataSet::subtract_instance ( const size_t &  instance_index)

Subtracts the inputs-targets instance with a given index from the data set. All instances are also set for training. Note that resizing is necessary here and therefore computationally expensive.

Parameters
instance_index: Index of instance to be removed.

Definition at line 1965 of file data_set.cpp.

◆ subtract_variable()

void OpenNN::DataSet::subtract_variable ( const size_t &  variable_index)

Removes a variable with given index from the data matrix.

Parameters
variable_index: Index of variable to be subtracted.

Definition at line 2039 of file data_set.cpp.

◆ sum_binary_inputs()

void OpenNN::DataSet::sum_binary_inputs ( void  )
Todo:

Definition at line 6263 of file data_set.cpp.

◆ transform_principal_components_data()

void OpenNN::DataSet::transform_principal_components_data ( const Matrix< double > &  principal_components)

Transforms the data according to the principal components.

Parameters
principal_components: Matrix containing the principal components.

Definition at line 3177 of file data_set.cpp.

◆ trim()

void OpenNN::DataSet::trim ( std::string &  str) const

Removes whitespace from the start and the end of the string passed as argument. This includes the ASCII characters "\t", "\n", "\v", "\f", "\r", and " ".

Parameters
str: String to be checked.

Definition at line 7353 of file data_set.cpp.

◆ unscale_data_mean_standard_deviation()

void OpenNN::DataSet::unscale_data_mean_standard_deviation ( const Vector< Statistics< double > > &  data_statistics)

Unscales the data matrix with given mean and standard deviation values. It updates the data matrix.

Parameters
data_statistics: Vector of statistics structures for all the variables in the data set. The size of that vector must be equal to the number of variables.

Definition at line 3821 of file data_set.cpp.

◆ unscale_data_minimum_maximum()

void OpenNN::DataSet::unscale_data_minimum_maximum ( const Vector< Statistics< double > > &  data_statistics)

Unscales the data matrix with given minimum and maximum values. It updates the data matrix.

Parameters
data_statistics: Vector of statistics structures for all the variables in the data set. The size of that vector must be equal to the number of variables.

Definition at line 3834 of file data_set.cpp.

◆ unscale_inputs_mean_standard_deviation()

void OpenNN::DataSet::unscale_inputs_mean_standard_deviation ( const Vector< Statistics< double > > &  data_statistics)

Unscales the input variables with given mean and standard deviation values. It updates the input variables of the data matrix.

Parameters
data_statistics: Vector of statistics structures for all the variables in the data set. The size of that vector must be equal to the number of variables.

Definition at line 3847 of file data_set.cpp.

◆ unscale_inputs_minimum_maximum()

void OpenNN::DataSet::unscale_inputs_minimum_maximum ( const Vector< Statistics< double > > &  data_statistics)

Unscales the input variables with given minimum and maximum values. It updates the input variables of the data matrix.

Parameters
data_statistics: Vector of statistics structures for all the data in the data set. The size of that vector must be equal to the number of variables.

Definition at line 3862 of file data_set.cpp.

◆ unscale_targets_mean_standard_deviation()

void OpenNN::DataSet::unscale_targets_mean_standard_deviation ( const Vector< Statistics< double > > &  data_statistics)

Unscales the target variables with given mean and standard deviation values. It updates the target variables of the data matrix.

Parameters
data_statistics: Vector of statistics structures for all the variables in the data set. The size of that vector must be equal to the number of variables.

Definition at line 3877 of file data_set.cpp.

◆ unscale_targets_minimum_maximum()

void OpenNN::DataSet::unscale_targets_minimum_maximum ( const Vector< Statistics< double > > &  data_statistics)

Unscales the target variables with given minimum and maximum values. It updates the target variables of the data matrix.

Parameters
data_statistics: Vector of statistics structures for all the variables. The size of that vector must be equal to the number of variables.

Definition at line 3892 of file data_set.cpp.

◆ unuse_constant_variables()

Vector< size_t > OpenNN::DataSet::unuse_constant_variables ( void  )

Removes the input or target indices of those variables with zero standard deviation. It might change the size of the vectors containing the inputs and targets indices.

Definition at line 2077 of file data_set.cpp.

◆ unuse_most_populated_target()

Vector< size_t > OpenNN::DataSet::unuse_most_populated_target ( const size_t &  instances_to_unuse)

This method unuses a given number of instances of the most populated target. If the given number is greater than the number of used instances which belong to that target, it unuses all the instances in that target. If the given number is lower than 1, it unuses 1 instance.

Parameters
instances_to_unuse: Number of instances to set unused.

Definition at line 6062 of file data_set.cpp.

◆ unuse_repeated_instances()

Vector< size_t > OpenNN::DataSet::unuse_repeated_instances ( void  )

Removes the training, selection and testing indices of those instances which are repeated in the data matrix. It might change the size of the vectors containing the training, selection and testing indices.

Definition at line 2120 of file data_set.cpp.

Member Data Documentation

◆ data

Matrix<double> OpenNN::DataSet::data
private

Data Matrix. The number of rows is the number of instances. The number of columns is the number of variables.

Definition at line 572 of file data_set.h.

◆ time_series_data

Matrix<double> OpenNN::DataSet::time_series_data
private

Time series data matrix. The number of rows is the number of instances before time series changes. The number of columns is the number of variables before time series changes.

Definition at line 578 of file data_set.h.


The documentation for this class was generated from the following files: