OpenNN  2.2
Open Neural Networks Library
data_set.h
1 /****************************************************************************************************************/
2 /* */
3 /* OpenNN: Open Neural Networks Library */
4 /* www.opennn.net */
5 /* */
6 /* D A T A S E T C L A S S H E A D E R */
7 /* */
8 /* Roberto Lopez */
9 /* Artelnics - Making intelligent use of data */
10 /* robertolopez@artelnics.com */
11 /* */
12 /****************************************************************************************************************/
13 
14 #ifndef __DATASET_H__
15 #define __DATASET_H__
16 
17 // System includes
18 
19 #include <iostream>
20 #include <fstream>
21 #include <string>
22 #include <sstream>
23 #include <cmath>
24 #include <algorithm>
25 #include <cstdlib>
26 #include <stdexcept>
27 #include <ctime>
28 #include <exception>
29 
30 #ifdef __OPENNN_MPI__
31 #include <mpi.h>
32 #endif
33 // OpenNN includes
34 
35 #include "vector.h"
36 #include "matrix.h"
37 
38 #include "missing_values.h"
39 #include "variables.h"
40 #include "instances.h"
41 
42 // TinyXml includes
43 
44 #include "../tinyxml2/tinyxml2.h"
45 
46 namespace OpenNN
47 {
48 
52 /// Data set class: holds a data matrix together with its variables, instances and missing values information, and declares the methods to load, scale and analyse it.
53 class DataSet
54 {
55 // NOTE(review): the line numbers embedded in this listing skip values, so some declarations may not be shown here - confirm against the original header.
56 public:
57 
58  // DEFAULT CONSTRUCTOR
59 
60  explicit DataSet(void);
61 
62  // DATA CONSTRUCTOR
63 
64  explicit DataSet(const Matrix<double>&);
65 
66  // INSTANCES AND VARIABLES CONSTRUCTOR
67 
68  explicit DataSet(const size_t&, const size_t&);
69 
70  // INSTANCES, INPUTS AND TARGETS CONSTRUCTOR
71 
72  explicit DataSet(const size_t&, const size_t&, const size_t&);
73 
74  // XML CONSTRUCTOR
75 
76  explicit DataSet(const tinyxml2::XMLDocument&);
77 
78  // FILE CONSTRUCTOR
79 
80  explicit DataSet(const std::string&);
81 
82  // COPY CONSTRUCTOR
83 
84  DataSet(const DataSet&);
85 
86  // DESTRUCTOR
87 
88  virtual ~DataSet(void);
89 
90  // ASSIGNMENT OPERATOR
91 
92  DataSet& operator = (const DataSet&);
93 
94  // EQUAL TO OPERATOR
95 
96  bool operator == (const DataSet&) const;
97 
98  // ENUMERATIONS
99 
101  /// Enumeration of available separators for the data file.
102  enum Separator{Space, Tab, Comma, Semicolon};
103 
105  /// Enumeration of available methods for scaling and unscaling the data.
106  enum ScalingUnscalingMethod{NoScaling, NoUnscaling, MinimumMaximum, MeanStandardDeviation};
107 
109  /// Enumeration of the units used for angular variables.
110  enum AngularUnits{Radians, Degrees};
111 
113  /// Enumeration of the file types.
114  enum FileType{TXT, DAT, DATA, CSV, ODS, XLSX, ARFF};
115 
117  /// Enumeration of the learning tasks.
118  enum ProjectType{Approximation, Classification, Forecasting, Association};
119 
120  // METHODS
121 
122  // Get methods
123 
124  FileType get_file_type(void) const; // Returns the file type.
125  std::string write_file_type(void) const; // Returns a string with the name of the file type.
126 
127  std::string write_first_cell(void) const; // Returns a string with the first cell for excel files.
128  std::string write_last_cell(void) const; // Returns a string with the last cell for excel files.
129  size_t write_sheet_number(void) const; // Returns the sheet number for excel files.
130 
131  ProjectType get_learning_task(void) const; // Returns the project type.
132  std::string write_learning_task(void) const; // Returns a string with the name of the project type.
133 
134  const std::string& get_data_file_name(void) const; // Returns the name of the data file.
135 
136  const bool& get_header_line(void) const; // Returns true if the first line of the data file has a header with the names of the variables.
137  const bool& get_rows_label(void) const; // Returns true if the data file has rows label, and false otherwise.
138 
139  const Separator& get_separator(void) const; // Returns the separator to be used in the data file.
140  std::string get_separator_string(void) const; // Returns the string which will be used as separator in the data file.
141  std::string write_separator(void) const; // Returns the string which will be used as separator in the data file.
142 
143  const std::string& get_missing_values_label(void) const; // Returns the string which will be used as label for the missing values in the data file.
144 
145  const size_t& get_lags_number(void) const; // Returns the number of lags to be used in a time series prediction application.
146  const size_t& get_steps_ahead(void) const; // Returns the number of steps ahead to be used in a time series prediction application.
147 
148  const bool& get_autoassociation(void) const;
149 
150  const Vector<size_t>& get_angular_variables(void) const;
151  const AngularUnits& get_angular_units(void) const; // Returns the units used for the angular variables (Radians or Degrees).
152 
153  static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string&);
154 
155  const MissingValues& get_missing_values(void) const; // Returns a reference to the missing values object in the data set.
157 
158  const Variables& get_variables(void) const; // Returns a constant reference to the variables object composing this data set object.
160 
161  const Instances& get_instances(void) const; // Returns a constant reference to the instances object composing this data set object.
163 
164  const bool& get_display(void) const;
165 
166  bool is_binary_classification(void) const; // Returns true if the data set is a binary classification problem, false otherwise.
167  bool is_multiple_classification(void) const; // Returns true if the data set is a multiple classification problem, false otherwise.
168 
169  bool is_binary_variable(const size_t&) const;
170 
171  // Data methods
172 
173  bool empty(void) const; // Returns true if the data matrix is empty, and false otherwise.
174 
175  const Matrix<double>& get_data(void) const;
176  const Matrix<double>& get_time_series_data(void) const;
177 
179 
183 
186 
189 
196 
197  // Instance methods
198 
199  Vector<double> get_instance(const size_t&) const;
200  Vector<double> get_instance(const size_t&, const Vector<size_t>&) const;
201 
202  // Variable methods
203 
204  Vector<double> get_variable(const size_t&) const;
205  Vector<double> get_variable(const size_t&, const Vector<size_t>&) const;
206 
207  // Set methods
208 
209  void set(void);
210  void set(const Matrix<double>&);
211  void set(const size_t&, const size_t&);
212  void set(const size_t&, const size_t&, const size_t&);
213  void set(const DataSet&);
214  void set(const tinyxml2::XMLDocument&);
215  void set(const std::string&);
216 
217  // Data methods
218 
219  void set_data(const Matrix<double>&);
220 
221  void set_instances_number(const size_t&);
222  void set_variables_number(const size_t&);
223 
224  void set_data_file_name(const std::string&);
225 
226  void set_file_type(const FileType&); // Sets the file type.
227  void set_file_type(const std::string&);
228 
229  void set_header_line(const bool&); // Sets if the data file contains a header with the names of the variables.
230  void set_rows_label(const bool&); // Sets if the data file contains rows label.
231 
232  void set_separator(const Separator&);
233  void set_separator(const std::string&);
234 
235  void set_missing_values_label(const std::string&);
236 
237  void set_lags_number(const size_t&);
238  void set_steps_ahead_number(const size_t&);
239 
240  void set_autoassociation(const bool&);
241 
242  void set_learning_task(const ProjectType&);
243  void set_learning_task(const std::string&);
244 
247 
248  // Utilities
249 
250  void set_display(const bool&);
251 
252  void set_default(void);
253 
254  void set_MPI(const DataSet*);
255 
256  // Instance methods
257 
258  void set_instance(const size_t&, const Vector<double>&);
259 
260  // Data resizing methods
261 
262  void add_instance(const Vector<double>&);
263  void subtract_instance(const size_t&);
264 
265  void append_variable(const Vector<double>&);
266  void subtract_variable(const size_t&);
267 
270 
272 
273  // Initialization methods
274 
275  void initialize_data(const double&);
276 
277  void randomize_data_uniform(const double& minimum = -1.0, const double& maximum = 1.0);
278  void randomize_data_normal(const double& mean = 0.0, const double& standard_deviation = 1.0);
279 
280  // Statistics methods
281 
283 
285 
287 
290 
292 
296 
300 
303 
307 
308  // Correlation methods
309 
311 
312  // Principal components method
313 
315 
319 
320  void subtract_input_data_mean(void); // Subtracts the mean from every input variable.
321 
322  // Histogram methods
323 
324  Vector< Histogram<double> > calculate_data_histograms(const size_t& = 10) const;
325 
326  Vector< Histogram<double> > calculate_targets_histograms(const size_t& = 10) const;
327 
328  // Box and whiskers
329 
331 
332  size_t calculate_training_negatives(const size_t&) const;
333  size_t calculate_selection_negatives(const size_t&) const;
334  size_t calculate_testing_negatives(const size_t&) const;
335 
336  // Filtering methods
337 
339 
340  // Data scaling
341 
344 
347 /*
348  void scale_data(const std::string&, const Vector< Statistics<double> >&);
349 
350  Vector< Statistics<double> > scale_data(const std::string&);
351 */
352  // Input variables scaling
353 
356 
359 
360  Vector< Statistics<double> > scale_inputs(const std::string&);
361  void scale_inputs(const std::string&, const Vector< Statistics<double> >&);
362 
363  // Target variables scaling
364 
367 
370 
371  Vector< Statistics<double> > scale_targets(const std::string&);
372  void scale_targets(const std::string&, const Vector< Statistics<double> >&);
373 
374  // Data unscaling
375 
378 
379  // Input variables unscaling
380 
383 
384  // Target variables unscaling
385 
388 
389  // Classification methods
390 
392 
394 
397 
399 
401 
404 
405  void sum_binary_inputs(void);
406 
407  // Outlier detection
408 
409  Matrix<double> calculate_instances_distances(const size_t&) const;
410  Matrix<size_t> calculate_nearest_neighbors(const Matrix<double>&, const size_t&) const;
411  Vector<double> calculate_k_distances(const Matrix<double>&, const size_t&) const;
413  Vector<double> calculate_reachability_density(const Matrix<double>&, const size_t&) const;
414  Vector<double> calculate_local_outlier_factor(const size_t& = 5) const;
415 
416  Vector<size_t> clean_local_outlier_factor(const size_t& = 5);
417 
418  Vector<size_t> calculate_Tukey_outliers(const size_t&, const double& = 1.5) const;
419 
420  Vector< Vector<size_t> > calculate_Tukey_outliers(const double& = 1.5) const;
421 
422  // Time series methods
423 
424  Matrix<double> calculate_autocorrelation(const size_t& = 10) const;
426 
427  // Data generation
428 
429  void generate_data_approximation(const size_t&, const size_t&);
430 
431  void generate_data_binary_classification(const size_t&, const size_t&);
432  void generate_data_multiple_classification(const size_t&, const size_t&);
433 
434  // Serialization methods
435 
436  std::string to_string(void) const; // Returns a string representation of the current data set object.
437 
438  void print(void) const; // Prints to the screen in text format the members of the data set object.
439  void print_summary(void) const; // Prints to the screen in text format the main numbers from the data set object.
440 
441  tinyxml2::XMLDocument* to_XML(void) const; // Serializes the data set object into an XML document of the TinyXML library.
442  void from_XML(const tinyxml2::XMLDocument&);
443 
444  void write_XML(tinyxml2::XMLPrinter&) const; // Serializes the data set object into XML through the given printer, without keeping the DOM tree.
445  //void read_XML( );
446 
447  void save(const std::string&) const;
448  void load(const std::string&);
449 
450  void print_data(void) const; // Prints to the screen the values of the data matrix.
451  void print_data_preview(void) const;
452 
453  void save_data(void) const; // Saves to the data file the values of the data matrix.
454 
455  bool has_data(void) const;
456 
457  // Data load methods
458 
459  void load_data(void); // Loads the data file.
460  void load_data_binary(void); // Loads the data from a binary data file.
461  void load_time_series_data_binary(void); // Loads data from a binary data file for time series prediction methods.
462 
464 
466 
467  void convert_time_series(void);
468  void convert_association(void);
469 
470  void convert_angular_variable_degrees(const size_t&);
471  void convert_angular_variable_radians(const size_t&);
472 
475 
476  void convert_angular_variables(void);
477 
478  // Missing values
479 
480  void scrub_missing_values_unuse(void); // Sets all the instances with missing values to "Unused".
481  void scrub_missing_values_mean(void); // Substitutes all the missing values by the mean of the corresponding variable.
482  void scrub_missing_values(void);
483 
484  // String utilities
485 
486  size_t count_tokens(std::string&) const;
487 
488  Vector<std::string> get_tokens(const std::string&) const;
489 
490  bool is_numeric(const std::string&) const;
491 
492  void trim(std::string&) const;
493 
494  std::string get_trimmed(const std::string&) const;
495 
496  std::string prepend(const std::string&, const std::string&) const;
497 
498  // Vector string utilities
499 
500  bool is_numeric(const Vector<std::string>&) const;
501  bool is_not_numeric(const Vector<std::string>&) const;
502  bool is_mixed(const Vector<std::string>&) const;
503 
504 private:
505 
506  // MEMBERS
507 
509 
511 
513  /// First cell.
514  std::string first_cell;
515 
517  /// Last cell.
518  std::string last_cell;
519 
521  /// Sheet number.
522  size_t sheet_number;
523 
525  /// Data file name.
526  std::string data_file_name;
527 
529 
531 
533 
535 
537 
539 
541  /// Missing values label.
542  std::string missing_values_label;
543 
545  /// Number of lags.
546  size_t lags_number;
547 
549  /// Number of steps ahead.
550  size_t steps_ahead;
551 
553 
555 
557 
559 
561 
563 
565 
567 
571 
573 
577 
579 
581 
583 
585 
587 
589 
591 
593  /// Display messages to screen.
594  bool display;
595 
596  // METHODS
597 
598  size_t get_column_index(const Vector< Vector<std::string> >&, const size_t) const;
599 
600  void check_separator(const std::string&) const;
601 
602  size_t count_data_file_columns_number(void) const;
603  void check_header_line(void);
605 
606  void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&);
607 
610 
611 };
612 
613 }
614 
615 #endif
616 
617 // OpenNN: Open Neural Networks Library.
618 // Copyright (c) 2005-2016 Roberto Lopez.
619 //
620 // This library is free software; you can redistribute it and/or
621 // modify it under the terms of the GNU Lesser General Public
622 // License as published by the Free Software Foundation; either
623 // version 2.1 of the License, or any later version.
624 //
625 // This library is distributed in the hope that it will be useful,
626 // but WITHOUT ANY WARRANTY; without even the implied warranty of
627 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
628 // Lesser General Public License for more details.
629 
630 // You should have received a copy of the GNU Lesser General Public
631 // License along with this library; if not, write to the Free Software
632 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
Vector< std::string > arrange_association_names(const Vector< std::string > &) const
Definition: data_set.cpp:5468
Vector< Statistics< double > > calculate_inputs_statistics(void) const
Definition: data_set.cpp:2828
void unscale_inputs_minimum_maximum(const Vector< Statistics< double > > &)
Definition: data_set.cpp:3862
size_t count_tokens(std::string &) const
Definition: data_set.cpp:7230
bool has_data(void) const
Definition: data_set.cpp:6824
Matrix< double > arrange_training_target_data(void) const
Definition: data_set.cpp:877
void from_XML(const tinyxml2::XMLDocument &)
Definition: data_set.cpp:4296
Vector< double > calculate_k_distances(const Matrix< double > &, const size_t &) const
Definition: data_set.cpp:6375
void randomize_data_normal(const double &mean=0.0, const double &standard_deviation=1.0)
Definition: data_set.cpp:3927
Vector< Statistics< double > > calculate_targets_statistics(void) const
Definition: data_set.cpp:2859
Separator
Enumeration of available separators for the data file.
Definition: data_set.h:102
Matrix< double > calculate_reachability_distances(const Matrix< double > &, const Vector< double > &) const
Definition: data_set.cpp:6402
const MissingValues & get_missing_values(void) const
Returns a reference to the missing values object in the data set.
Definition: data_set.cpp:489
Variables * get_variables_pointer(void)
Returns a pointer to the variables object composing this data set object.
Definition: data_set.cpp:210
Vector< double > get_instance(const size_t &) const
Definition: data_set.cpp:956
Vector< Vector< std::string > > set_from_data_file(void)
Definition: data_set.cpp:5202
bool is_numeric(const std::string &) const
Definition: data_set.cpp:7326
Matrix< double > arrange_testing_target_data(void) const
Definition: data_set.cpp:941
std::string to_string(void) const
Returns a string representation of the current data set object.
Definition: data_set.cpp:4626
void unscale_data_minimum_maximum(const Vector< Statistics< double > > &)
Definition: data_set.cpp:3834
const size_t & get_steps_ahead(void) const
Returns the number of steps ahead to be used in a time series prediction application.
Definition: data_set.cpp:663
void set_data(const Matrix< double > &)
Definition: data_set.cpp:1524
Matrix< double > time_series_data
Definition: data_set.h:578
void scrub_missing_values_mean(void)
Substitutes all the missing values by the mean of the corresponding variable.
Definition: data_set.cpp:7161
Matrix< double > calculate_negatives_data_statistics_matrix(void) const
Calculate the statistics of the instances with negative targets in binary classification problems...
Definition: data_set.cpp:2601
FileType get_file_type(void) const
Returns the file type.
Definition: data_set.cpp:365
void add_instance(const Vector< double > &)
Definition: data_set.cpp:1928
const size_t & get_lags_number(void) const
Returns the number of lags to be used in a time series prediction application.
Definition: data_set.cpp:653
Vector< Histogram< double > > calculate_data_histograms(const size_t &=10) const
Definition: data_set.cpp:2236
Matrix< double > arrange_training_input_data(void) const
Definition: data_set.cpp:861
Vector< size_t > unuse_non_significant_inputs(void)
Unuses those binary inputs whose positives do not correspond to any positive in the target variable...
Definition: data_set.cpp:2175
void convert_association(void)
Definition: data_set.cpp:5503
void set_default(void)
Definition: data_set.cpp:1280
void unscale_data_mean_standard_deviation(const Vector< Statistics< double > > &)
Definition: data_set.cpp:3821
void convert_angular_variable_degrees(const size_t &)
Definition: data_set.cpp:6923
void set_file_type(const FileType &)
Sets the file type.
Definition: data_set.cpp:1587
ProjectType learning_task
Project type.
Definition: data_set.h:558
Vector< size_t > angular_variables
Indices of angular variables.
Definition: data_set.h:562
void set_data_file_name(const std::string &)
Definition: data_set.cpp:1577
Matrix< double > calculate_positives_data_statistics_matrix(void) const
Calculate the statistics of the instances with positive targets in binary classification problems...
Definition: data_set.cpp:2536
void read_instance(const std::string &, const Vector< Vector< std::string > > &, const size_t &)
Definition: data_set.cpp:5068
std::string data_file_name
Data file name.
Definition: data_set.h:526
FileType
Enumeration of the file types.
Definition: data_set.h:114
void set_learning_task(const ProjectType &)
Definition: data_set.cpp:1775
Vector< size_t > balance_multiple_targets_distribution(void)
Definition: data_set.cpp:5949
Vector< Vector< double > > calculate_training_instances_shape_parameters(void) const
Definition: data_set.cpp:2768
Vector< size_t > calculate_target_distribution(void) const
Definition: data_set.cpp:5775
bool rows_label
Header which contains the rows label.
Definition: data_set.h:534
size_t count_data_file_columns_number(void) const
Definition: data_set.cpp:4901
std::string first_cell
First cell.
Definition: data_set.h:514
void set_steps_ahead_number(const size_t &)
Definition: data_set.cpp:1750
Vector< size_t > unuse_repeated_instances(void)
Definition: data_set.cpp:2120
void unscale_inputs_mean_standard_deviation(const Vector< Statistics< double > > &)
Definition: data_set.cpp:3847
tinyxml2::XMLDocument * to_XML(void) const
Serializes the data set object into a XML document of the TinyXML library.
Definition: data_set.cpp:3937
Matrix< double > calculate_covariance_matrix(void) const
Definition: data_set.cpp:2980
AngularUnits angular_units
Units of angular variables.
Definition: data_set.h:566
const bool & get_header_line(void) const
Returns true if the first line of the data file has a header with the names of the variables...
Definition: data_set.cpp:519
void append_variable(const Vector< double > &)
Definition: data_set.cpp:1998
Matrix< double > arrange_input_data(void) const
Definition: data_set.cpp:797
bool is_mixed(const Vector< std::string > &) const
Definition: data_set.cpp:7444
void initialize_data(const double &)
Definition: data_set.cpp:3905
Vector< double > calculate_distances(void) const
Definition: data_set.cpp:5859
void set_missing_values_label(const std::string &)
Definition: data_set.cpp:1708
const bool & get_autoassociation(void) const
Definition: data_set.cpp:674
size_t write_sheet_number(void) const
Returns the sheet number for excel files.
Definition: data_set.cpp:435
void subtract_input_data_mean(void)
Subtracts the mean from every input variable.
Definition: data_set.cpp:3296
Matrix< double > arrange_selection_data(void) const
Definition: data_set.cpp:762
void print(void) const
Prints to the screen in text format the members of the data set object.
Definition: data_set.cpp:4649
Vector< size_t > calculate_Tukey_outliers(const size_t &, const double &=1.5) const
Definition: data_set.cpp:6524
Vector< double > calculate_reachability_density(const Matrix< double > &, const size_t &) const
Definition: data_set.cpp:6435
bool association
Association flag.
Definition: data_set.h:554
ScalingUnscalingMethod
Enumeration of available methods for scaling and unscaling the data.
Definition: data_set.h:106
void set_angular_variables(const Vector< size_t > &)
Definition: data_set.cpp:1822
size_t calculate_training_negatives(const size_t &) const
Definition: data_set.cpp:2353
const bool & get_rows_label(void) const
Returns true if the data file has rows label, and false otherwise.
Definition: data_set.cpp:529
void generate_data_approximation(const size_t &, const size_t &)
Definition: data_set.cpp:6717
bool operator==(const DataSet &) const
Definition: data_set.cpp:177
std::string get_separator_string(void) const
Returns the string which will be used as separator in the data file.
Definition: data_set.cpp:549
void transform_principal_components_data(const Matrix< double > &)
Definition: data_set.cpp:3177
const Matrix< double > & get_data(void) const
Definition: data_set.cpp:334
void scrub_missing_values_unuse(void)
Sets all the instances with missing values to "Unused".
Definition: data_set.cpp:7146
const bool & get_display(void) const
Definition: data_set.cpp:241
Matrix< double > calculate_data_shape_parameters_matrix(void) const
Definition: data_set.cpp:2668
Vector< Statistics< double > > scale_inputs_mean_standard_deviation(void)
Definition: data_set.cpp:3471
Vector< double > get_variable(const size_t &) const
Definition: data_set.cpp:1021
Vector< Statistics< double > > scale_targets(const std::string &)
Definition: data_set.cpp:3736
MissingValues * get_missing_values_pointer(void)
Returns a pointer to the missing values object in the data set.
Definition: data_set.cpp:499
void set_rows_label(const bool &)
Sets if the data file contains rows label.
Definition: data_set.cpp:1650
Matrix< double > arrange_used_target_data(void) const
Definition: data_set.cpp:846
Vector< Vector< double > > calculate_selection_instances_shape_parameters(void) const
Definition: data_set.cpp:2787
Vector< size_t > arrange_binary_inputs_indices(void) const
Returns a vector with the indices of the inputs that are binary.
Definition: data_set.cpp:6215
Matrix< double > arrange_selection_input_data(void) const
Definition: data_set.cpp:893
bool is_binary_classification(void) const
Returns true if the data set is a binary classification problem, false otherwise. ...
Definition: data_set.cpp:251
const Instances & get_instances(void) const
Returns a constant reference to the instances object composing this data set object.
Definition: data_set.cpp:220
Matrix< double > arrange_selection_target_data(void) const
Definition: data_set.cpp:909
void set_MPI(const DataSet *)
Definition: data_set.cpp:1308
void convert_angular_variables(void)
Definition: data_set.cpp:7110
Matrix< double > calculate_linear_correlations(void) const
Definition: data_set.cpp:2933
void set_display(const bool &)
Definition: data_set.cpp:1267
Separator separator
Separator character.
Definition: data_set.h:538
Matrix< double > arrange_testing_data(void) const
Definition: data_set.cpp:780
Matrix< double > arrange_testing_input_data(void) const
Definition: data_set.cpp:925
std::string write_first_cell(void) const
Returns a string with the first cell for excel files.
Definition: data_set.cpp:415
Vector< Histogram< double > > calculate_targets_histograms(const size_t &=10) const
Definition: data_set.cpp:2282
void print_data(void) const
Prints to the screen the values of the data matrix.
Definition: data_set.cpp:4741
void generate_data_multiple_classification(const size_t &, const size_t &)
Definition: data_set.cpp:6813
Vector< Statistics< double > > scale_data_mean_standard_deviation(void)
Definition: data_set.cpp:3282
void save_data(void) const
Saves to the data file the values of the data matrix.
Definition: data_set.cpp:4792
Vector< Statistics< double > > scale_inputs(const std::string &)
Definition: data_set.cpp:3553
Matrix< double > arrange_target_data(void) const
Definition: data_set.cpp:814
std::string write_file_type(void) const
Returns a string with the name of the file type.
Definition: data_set.cpp:375
void read_from_data_file(const Vector< Vector< std::string > > &)
Performs a second data file read in which the data is set.
Definition: data_set.cpp:5362
void unscale_targets_mean_standard_deviation(const Vector< Statistics< double > > &)
Definition: data_set.cpp:3877
DataSet & operator=(const DataSet &)
Definition: data_set.cpp:141
size_t steps_ahead
Number of steps ahead.
Definition: data_set.h:550
const std::string & get_data_file_name(void) const
Returns the name of the data file.
Definition: data_set.cpp:509
Matrix< double > calculate_data_statistics_matrix(void) const
Definition: data_set.cpp:2507
void set_header_line(const bool &)
Sets if the data file contains a header with the names of the variables.
Definition: data_set.cpp:1640
void set_autoassociation(const bool &)
Definition: data_set.cpp:1764
Matrix< Vector< double > > calculate_cross_correlation(void) const
Calculates the cross-correlation between all the variables in the data set.
Definition: data_set.cpp:6688
MissingValues missing_values
Missing values object.
Definition: data_set.h:590
void load_time_series_data_binary(void)
This method loads data from a binary data file for time series prediction methods.
Definition: data_set.cpp:5640
std::string get_trimmed(const std::string &) const
Definition: data_set.cpp:7369
std::string prepend(const std::string &, const std::string &) const
Definition: data_set.cpp:7391
Matrix< double > calculate_instances_distances(const size_t &) const
Definition: data_set.cpp:6297
Vector< double > calculate_training_target_data_mean(void) const
Returns the mean values of the target variables on the training instances.
Definition: data_set.cpp:2883
Instances * get_instances_pointer(void)
Returns a pointer to the instances object composing this data set object.
Definition: data_set.cpp:230
void unscale_targets_minimum_maximum(const Vector< Statistics< double > > &)
Definition: data_set.cpp:3892
size_t lags_number
Number of lags.
Definition: data_set.h:546
std::string write_learning_task(void) const
Returns a string with the name of the project type.
Definition: data_set.cpp:455
ProjectType get_learning_task(void) const
Returns the project type.
Definition: data_set.cpp:445
void load_data_binary(void)
This method loads the data from a binary data file.
Definition: data_set.cpp:5596
bool header_line
Header which contains variables name.
Definition: data_set.h:530
Vector< size_t > unuse_constant_variables(void)
Definition: data_set.cpp:2077
Vector< Statistics< double > > calculate_training_instances_statistics(void) const
Definition: data_set.cpp:2707
Vector< double > calculate_testing_target_data_mean(void) const
Returns the mean values of the target variables on the testing instances.
Definition: data_set.cpp:2915
size_t calculate_selection_negatives(const size_t &) const
Definition: data_set.cpp:2392
const Separator & get_separator(void) const
Returns the separator to be used in the data file.
Definition: data_set.cpp:539
Vector< size_t > balance_binary_targets_distribution(const double &=100.0)
Definition: data_set.cpp:5894
void load(const std::string &)
Definition: data_set.cpp:4718
void convert_time_series(void)
Definition: data_set.cpp:5481
void randomize_data_uniform(const double &minimum=-1.0, const double &maximum=1.0)
Definition: data_set.cpp:3916
Vector< Statistics< double > > scale_data_minimum_maximum(void)
Definition: data_set.cpp:3266
FileType file_type
File type.
Definition: data_set.h:510
void write_XML(tinyxml2::XMLPrinter &) const
Serializes the data set object into an XML document of the TinyXML library without keeping the DOM tree i...
Definition: data_set.cpp:4137
Matrix< double > data
Definition: data_set.h:572
Vector< Vector< double > > calculate_data_shape_parameters(void) const
Definition: data_set.cpp:2493
void set_instances_number(const size_t &)
Definition: data_set.cpp:1846
std::string write_last_cell(void) const
Returns a string with the last cell for excel files.
Definition: data_set.cpp:425
Matrix< double > perform_principal_components_analysis(const double &=0.0)
Definition: data_set.cpp:3019
void convert_angular_variables_radians(const Vector< size_t > &)
Definition: data_set.cpp:7063
Variables variables
Variables object (inputs and target variables).
Definition: data_set.h:582
std::string missing_values_label
Missing values label.
Definition: data_set.h:542
const Vector< size_t > & get_angular_variables(void) const
Definition: data_set.cpp:686
Matrix< double > arrange_training_data(void) const
Definition: data_set.cpp:744
void print_data_preview(void) const
Definition: data_set.cpp:4755
void generate_data_binary_classification(const size_t &, const size_t &)
Definition: data_set.cpp:6777
void check_header_line(void)
Definition: data_set.cpp:4946
void sum_binary_inputs(void)
Definition: data_set.cpp:6263
AngularUnits
Enumeration of the units used for angular variables.
Definition: data_set.h:110
Vector< double > calculate_local_outlier_factor(const size_t &=5) const
Definition: data_set.cpp:6466
Matrix< double > calculate_autocorrelation(const size_t &=10) const
Definition: data_set.cpp:6655
void trim(std::string &) const
Definition: data_set.cpp:7353
void set_lags_number(const size_t &)
Definition: data_set.cpp:1738
Vector< Statistics< double > > scale_targets_mean_standard_deviation(void)
Definition: data_set.cpp:3654
Vector< Vector< double > > calculate_box_plots(void) const
Definition: data_set.cpp:2321
const std::string & get_missing_values_label(void) const
Returns the string which will be used as label for the missing values in the data file...
Definition: data_set.cpp:643
Vector< std::string > arrange_time_series_names(const Vector< std::string > &) const
Definition: data_set.cpp:5437
bool is_multiple_classification(void) const
Returns true if the data set is a multiple classification problem, false otherwise.
Definition: data_set.cpp:271
void subtract_variable(const size_t &)
Definition: data_set.cpp:2039
Vector< Statistics< double > > scale_inputs_minimum_maximum(void)
Definition: data_set.cpp:3519
void check_separator(const std::string &) const
Definition: data_set.cpp:4874
bool is_not_numeric(const Vector< std::string > &) const
Definition: data_set.cpp:7425
Vector< std::string > read_header_line(void) const
Returns the name of the columns in the data set as a list of strings.
Definition: data_set.cpp:5022
Vector< std::string > get_tokens(const std::string &) const
Definition: data_set.cpp:7281
std::string last_cell
Last cell.
Definition: data_set.h:518
size_t calculate_testing_negatives(const size_t &) const
Definition: data_set.cpp:2431
bool display
Display messages to screen.
Definition: data_set.h:594
Vector< Statistics< double > > calculate_data_statistics(void) const
Definition: data_set.cpp:2476
Vector< size_t > clean_local_outlier_factor(const size_t &=5)
Definition: data_set.cpp:6496
void convert_angular_variable_radians(const size_t &)
Definition: data_set.cpp:6969
void set_angular_units(AngularUnits &)
Sets the units of the angular variables (Radians or Degrees).
Definition: data_set.cpp:1832
ProjectType
Enumeration of the learning tasks.
Definition: data_set.h:118
const Variables & get_variables(void) const
Returns a constant reference to the variables object composing this data set object.
Definition: data_set.cpp:200
bool empty(void) const
Returns true if the data matrix is empty, and false otherwise.
Definition: data_set.cpp:322
Vector< double > calculate_selection_target_data_mean(void) const
Returns the mean values of the target variables on the selection instances.
Definition: data_set.cpp:2899
Vector< size_t > unuse_most_populated_target(const size_t &)
Definition: data_set.cpp:6062
virtual ~DataSet(void)
Destructor.
Definition: data_set.cpp:130
void set_instance(const size_t &, const Vector< double > &)
Definition: data_set.cpp:1879
void set_separator(const Separator &)
Definition: data_set.cpp:1661
bool is_binary_variable(const size_t &) const
Definition: data_set.cpp:297
Vector< size_t > balance_approximation_targets_distribution(const double &=10.0)
Definition: data_set.cpp:6178
size_t get_column_index(const Vector< Vector< std::string > > &, const size_t) const
Definition: data_set.cpp:4848
void load_data(void)
This method loads the data file.
Definition: data_set.cpp:5517
Vector< Vector< double > > calculate_testing_instances_shape_parameters(void) const
Definition: data_set.cpp:2806
void scrub_missing_values(void)
Definition: data_set.cpp:7191
Vector< Statistics< double > > scale_targets_minimum_maximum(void)
Definition: data_set.cpp:3719
Vector< size_t > arrange_real_inputs_indices(void) const
Returns a vector with the indices of the inputs that are real.
Definition: data_set.cpp:6239
Vector< Statistics< double > > calculate_testing_instances_statistics(void) const
Definition: data_set.cpp:2749
Matrix< double > arrange_used_input_data(void) const
Definition: data_set.cpp:830
void subtract_instance(const size_t &)
Definition: data_set.cpp:1965
Vector< size_t > filter_data(const Vector< double > &, const Vector< double > &)
Definition: data_set.cpp:6845
Matrix< size_t > calculate_nearest_neighbors(const Matrix< double > &, const size_t &) const
Definition: data_set.cpp:6344
Vector< Statistics< double > > calculate_selection_instances_statistics(void) const
Definition: data_set.cpp:2728
const AngularUnits & get_angular_units(void) const
Returns the units used for the angular variables (Radians or Degrees).
Definition: data_set.cpp:696
const Matrix< double > & get_time_series_data(void) const
Definition: data_set.cpp:345
void convert_angular_variables_degrees(const Vector< size_t > &)
Definition: data_set.cpp:7015
void print_summary(void) const
Prints to the screen in text format the main numbers from the data set object.
Definition: data_set.cpp:4662
void set_variables_number(const size_t &)
Definition: data_set.cpp:1863
size_t sheet_number
Sheet number.
Definition: data_set.h:522
Matrix< double > get_instances_submatrix_data(const Vector< size_t > &) const
Definition: data_set.cpp:355
Instances instances
Instances object (training, selection and testing instances).
Definition: data_set.h:586
void save(const std::string &) const
Definition: data_set.cpp:4683
static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string &)
Definition: data_set.cpp:707
std::string write_separator(void) const
Returns the string which will be used as separator in the data file.
Definition: data_set.cpp:596