OpenNN  2.2
Open Neural Networks Library
data_set.cpp
1 /****************************************************************************************************************/
2 /* */
3 /* OpenNN: Open Neural Networks Library */
4 /* www.opennn.net */
5 /* */
6 /* D A T A S E T C L A S S */
7 /* */
8 /* Roberto Lopez */
9 /* Artelnics - Making intelligent use of data */
10 /* robertolopez@artelnics.com */
11 /* */
12 /****************************************************************************************************************/
13 
14 // OpenNN includes
15 
16 #include "data_set.h"
17 
18 
19 namespace OpenNN
20 {
21 
22 // DEFAULT CONSTRUCTOR
23 
26 
28 {
29  set();
30 
31  set_default();
32 }
33 
34 
35 // DATA CONSTRUCTOR
36 
40 
42 {
43  set(data);
44 
45  set_default();
46 }
47 
48 
49 // GENERAL CONSTRUCTOR
50 
57 
58 DataSet::DataSet(const size_t& new_instances_number, const size_t& new_variables_number)
59 {
60  set(new_instances_number, new_variables_number);
61 
62  set_default();
63 }
64 
65 
66 // INSTANCES, INPUTS AND TARGETS NUMBERS CONSTRUCTORS
67 
74 
75 DataSet::DataSet(const size_t& new_instances_number, const size_t& new_inputs_number, const size_t& new_targets_number)
76 {
77  set(new_instances_number, new_inputs_number, new_targets_number);
78 
79  set_default();
80 }
81 
82 
83 // XML CONSTRUCTOR
84 
87 
88 DataSet::DataSet(const tinyxml2::XMLDocument& data_set_document)
89 {
90  set_default();
91 
92  from_XML(data_set_document);
93 }
94 
95 
96 // FILE CONSTRUCTOR
97 
101 
102 DataSet::DataSet(const std::string& file_name)
103 {
104  set();
105 
106  set_default();
107 
108  load(file_name);
109 }
110 
111 
112 // COPY CONSTRUCTOR
113 
117 
118 DataSet::DataSet(const DataSet& other_data_set)
119 {
120  set_default();
121 
122  set(other_data_set);
123 }
124 
125 
126 // DESTRUCTOR
127 
129 
131 {
132 }
133 
134 
135 // ASSIGNMENT OPERATOR
136 
140 
141 DataSet& DataSet::operator = (const DataSet& other_data_set)
142 {
143  if(this != &other_data_set)
144  {
145  data_file_name = other_data_set.data_file_name;
146 
147  // Data matrix
148 
149  data = other_data_set.data;
150 
151  // Variables
152 
153  variables = other_data_set.variables;
154 
155  // Instances
156 
157  instances = other_data_set.instances;
158 
159  // Utilities
160 
161  display = other_data_set.display;
162  }
163 
164  return(*this);
165 }
166 
167 
168 // EQUAL TO OPERATOR
169 
170 // bool operator == (const DataSet&) const method
171 
176 
177 bool DataSet::operator == (const DataSet& other_data_set) const
178 {
179  if(data_file_name == other_data_set.data_file_name
180  && data == other_data_set.data
181  && variables == other_data_set.variables
182  && instances == other_data_set.instances
183  && display == other_data_set.display)
184  {
185  return(true);
186  }
187  else
188  {
189  return(false);
190  }
191 }
192 
193 
194 // METHODS
195 
196 // const Variables& get_variables(void) const
197 
199 
201 {
202  return(variables);
203 }
204 
205 
206 // Variables* get_variables_pointer(void) const
207 
209 
211 {
212  return(&variables);
213 }
214 
215 
216 // const Instances& get_instances(void) const
217 
219 
221 {
222  return(instances);
223 }
224 
225 
226 // Instances* get_instances_pointer(void)
227 
229 
231 {
232  return(&instances);
233 }
234 
235 
236 // const bool& get_display(void) const method
237 
240 
241 const bool& DataSet::get_display(void) const
242 {
243  return(display);
244 }
245 
246 
247 // bool is_binary_classification(void) const method
248 
250 
252 {
254  {
255  return(false);
256  }
257 
259  {
260  return(false);
261  }
262 
263  return(true);
264 }
265 
266 
267 // bool is_multiple_classification(void) const method
268 
270 
272 {
273  const Matrix<double> target_data = arrange_target_data();
274 
275  if(!target_data.is_binary())
276  {
277  return(false);
278  }
279 
280  for(size_t i = 0; i < target_data.get_rows_number(); i++)
281  {
282  if(target_data.arrange_row(i).calculate_sum() == 0.0)
283  {
284  return(false);
285  }
286  }
287 
288  return(true);
289 }
290 
291 
292 // bool is_binary_variable(const size_t&) const method
293 
296 
297 bool DataSet::is_binary_variable(const size_t& variable_index) const
298 {
299  const size_t instances_number = instances.get_instances_number();
300 
301  for(size_t i = 0; i < instances_number; i++)
302  {
303  if(data(i,variable_index) == 0.0 || data(i,variable_index) == 1.0)
304  {
305  continue;
306  }
307  else
308  {
309  return false;
310  }
311 
312  }
313 
314  return true;
315 }
316 
317 
318 // bool empty(void) const method
319 
321 
322 bool DataSet::empty(void) const
323 {
324  return(data.empty());
325 }
326 
327 
328 // const Matrix<double>& get_data(void) const method
329 
333 
335 {
336  return(data);
337 }
338 
339 
340 // const Matrix<double>& get_time_series_data(void) const method
341 
344 
346 {
347  return(time_series_data);
348 }
349 
350 // Matrix<double> get_instances_submatrix_data(const Vector<size_t>&) const method
351 
354 
356 {
357  return(data.arrange_submatrix_rows(instances_indices));
358 }
359 
360 
361 // FileType get_file_type(void) const method
362 
364 
366 {
367  return(file_type);
368 }
369 
370 
371 // std::string write_file_type(void) const
372 
374 
375 std::string DataSet::write_file_type(void) const
376 {
377  switch(file_type)
378  {
379  case TXT:
380  return "txt";
381 
382  case DAT:
383  return "dat";
384 
385  case CSV:
386  return "csv";
387 
388  case ODS:
389  return "ods";
390 
391  case XLSX:
392  return "xlsx";
393 
394  case ARFF:
395  return "arff";
396 
397  default:
398  {
399  std::ostringstream buffer;
400 
401  buffer << "OpenNN Exception: DataSet class.\n"
402  << "std::string write_file_type(void) const method.\n"
403  << "Unknown file type.\n";
404 
405  throw std::logic_error(buffer.str());
406  }
407  }
408 }
409 
410 
411 // std::string write_first_cell(void) const
412 
414 
415 std::string DataSet::write_first_cell(void) const
416 {
417  return first_cell;
418 }
419 
420 
421 // std::string write_last_cell(void) const
422 
424 
425 std::string DataSet::write_last_cell(void) const
426 {
427  return last_cell;
428 }
429 
430 
431 // size_t write_sheet_number(void) const
432 
434 
435 size_t DataSet::write_sheet_number(void) const
436 {
437  return sheet_number;
438 }
439 
440 
441 // ProjectType get_learning_task(void) const
442 
444 
446 {
447  return learning_task;
448 }
449 
450 
451 // string write_learning_task(void) const
452 
454 
455 std::string DataSet::write_learning_task(void) const
456 {
457  switch(learning_task)
458  {
459  case Approximation:
460  return "Approximation";
461 
462  case Classification:
463  return "Classification";
464 
465  case Forecasting:
466  return "Forecasting";
467 
468  case Association:
469  return "Association";
470 
471  default:
472  {
473  std::ostringstream buffer;
474 
475  buffer << "OpenNN Exception: DataSet class.\n"
476  << "std::string write_learning_task(void) const method.\n"
477  << "Unknown project type.\n";
478 
479  throw std::logic_error(buffer.str());
480  }
481  }
482 }
483 
484 
485 // const MissingValues& get_missing_values(void) const method
486 
488 
490 {
491  return(missing_values);
492 }
493 
494 
495 // MissingValues* get_missing_values_pointer(void) method
496 
498 
500 {
501  return(&missing_values);
502 }
503 
504 
505 // const std::string& get_data_file_name(void) const method
506 
508 
509 const std::string& DataSet::get_data_file_name(void) const
510 {
511  return(data_file_name);
512 }
513 
514 
515 // const bool& get_header_line(void) const
516 
518 
519 const bool& DataSet::get_header_line(void) const
520 {
521  return(header_line);
522 }
523 
524 
525 // const bool& get_rows_label(void) const
526 
528 
529 const bool& DataSet::get_rows_label(void) const
530 {
531  return(rows_label);
532 }
533 
534 
535 // const Separator& get_separator(void) const
536 
538 
540 {
541  return(separator);
542 }
543 
544 
545 // std::string get_separator_string(void) const
546 
548 
549 std::string DataSet::get_separator_string(void) const
550 {
551  switch(separator)
552  {
553  case Space:
554  {
555  return(" ");
556  }
557  break;
558 
559  case Tab:
560  {
561  return("\t");
562  }
563  break;
564 
565  case Comma:
566  {
567  return(",");
568  }
569  break;
570 
571  case Semicolon:
572  {
573  return(";");
574  }
575  break;
576 
577  default:
578  {
579  std::ostringstream buffer;
580 
581  buffer << "OpenNN Exception: DataSet class.\n"
582  << "std::string get_separator_string(void) const method.\n"
583  << "Unknown separator.\n";
584 
585  throw std::logic_error(buffer.str());
586  }
587  break;
588  }
589 }
590 
591 
592 // std::string write_separator(void) const
593 
595 
596 std::string DataSet::write_separator(void) const
597 {
598  switch(separator)
599  {
600  case Space:
601  {
602  return("Space");
603  }
604  break;
605 
606  case Tab:
607  {
608  return("Tab");
609  }
610  break;
611 
612  case Comma:
613  {
614  return("Comma");
615  }
616  break;
617 
618  case Semicolon:
619  {
620  return("Semicolon");
621  }
622  break;
623 
624  default:
625  {
626  std::ostringstream buffer;
627 
628  buffer << "OpenNN Exception: DataSet class.\n"
629  << "std::string write_separator(void) const method.\n"
630  << "Unknown separator.\n";
631 
632  throw std::logic_error(buffer.str());
633  }
634  break;
635  }
636 }
637 
638 
639 // const std::string& get_missing_values_label(void) const
640 
642 
643 const std::string& DataSet::get_missing_values_label(void) const
644 {
645  return(missing_values_label);
646 }
647 
648 
649 // const size_t& get_lags_number(void) const
650 
652 
653 const size_t& DataSet::get_lags_number(void) const
654 {
655  return(lags_number);
656 }
657 
658 
659 // const size_t& get_steps_ahead(void) const
660 
662 
663 const size_t& DataSet::get_steps_ahead(void) const
664 {
665  return(steps_ahead);
666 }
667 
668 
669 // const bool& get_autoassociation(void) const
670 
673 
674 const bool& DataSet::get_autoassociation(void) const
675 {
676  return(association);
677 }
678 
679 
680 // const Vector<size_t>& get_angular_variables(void) const method
681 
685 
687 {
688  return(angular_variables);
689 }
690 
691 
692 // const AngularUnits& get_angular_units(void) const method
693 
695 
697 {
698  return(angular_units);
699 }
700 
701 
702 // static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string&) method
703 
706 
707 DataSet::ScalingUnscalingMethod DataSet::get_scaling_unscaling_method(const std::string& scaling_unscaling_method)
708 {
709  if(scaling_unscaling_method == "NoScaling")
710  {
711  return(NoScaling);
712  }
713  else if(scaling_unscaling_method == "NoUnscaling")
714  {
715  return(NoUnscaling);
716  }
717  else if(scaling_unscaling_method == "MinimumMaximum")
718  {
719  return(MinimumMaximum);
720  }
721  else if(scaling_unscaling_method == "MeanStandardDeviation")
722  {
723  return(MeanStandardDeviation);
724  }
725  else
726  {
727  std::ostringstream buffer;
728 
729  buffer << "OpenNN Exception: DataSet class.\n"
730  << "static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string).\n"
731  << "Unknown scaling-unscaling method: " << scaling_unscaling_method << ".\n";
732 
733  throw std::logic_error(buffer.str());
734  }
735 }
736 
737 
738 // Matrix<double> arrange_training_data(void) const method
739 
743 
745 {
746  const size_t variables_number = variables.get_variables_number();
747 
748  Vector<size_t> variables_indices(0, 1, (int)variables_number-1);
749 
750  const Vector<size_t> training_indices = instances.arrange_training_indices();
751 
752  return(data.arrange_submatrix(training_indices, variables_indices));
753 }
754 
755 
756 // Matrix<double> arrange_selection_data(void) const method
757 
761 
763 {
764  const size_t variables_number = variables.get_variables_number();
765 
766  const Vector<size_t> selection_indices = instances.arrange_selection_indices();
767 
768  Vector<size_t> variables_indices(0, 1, (int)variables_number-1);
769 
770  return(data.arrange_submatrix(selection_indices, variables_indices));
771 }
772 
773 
774 // Matrix<double> arrange_testing_data(void) const method
775 
779 
781 {
782  const size_t variables_number = variables.get_variables_number();
783  Vector<size_t> variables_indices(0, 1, (int)variables_number-1);
784 
785  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
786 
787  return(data.arrange_submatrix(testing_indices, variables_indices));
788 }
789 
790 
791 // Matrix<double> arrange_input_data(void) const method
792 
796 
798 {
799  const size_t instances_number = instances.get_instances_number();
800  Vector<size_t> indices(0, 1, (size_t)instances_number-1);
801 
802  const Vector<size_t> input_indices = variables.arrange_inputs_indices();
803 
804  return(data.arrange_submatrix(indices, input_indices));
805 }
806 
807 
808 // Matrix<double> arrange_target_data(void) const method
809 
813 
815 {
816  const size_t instances_number = instances.get_instances_number();
817  Vector<size_t> indices(0, 1, (size_t)instances_number-1);
818 
819  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
820 
821  return(data.arrange_submatrix(indices, targets_indices));
822 }
823 
824 // Matrix<double> arrange_used_input_data(void) const method
825 
829 
831 {
833 
834  const Vector<size_t> input_indices = variables.arrange_inputs_indices();
835 
836  return(data.arrange_submatrix(indices, input_indices));
837 }
838 
839 
840 // Matrix<double> arrange_used_target_data(void) const method
841 
845 
847 {
849 
850  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
851 
852  return(data.arrange_submatrix(indices, targets_indices));
853 }
854 
855 // Matrix<double> arrange_training_input_data(void) const method
856 
860 
862 {
863  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
864 
865  const Vector<size_t> training_indices = instances.arrange_training_indices();
866 
867  return(data.arrange_submatrix(training_indices, inputs_indices));
868 }
869 
870 
871 // Matrix<double> arrange_training_target_data(void) const method
872 
876 
878 {
879  const Vector<size_t> training_indices = instances.arrange_training_indices();
880 
881  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
882 
883  return(data.arrange_submatrix(training_indices, targets_indices));
884 }
885 
886 
887 // Matrix<double> arrange_selection_input_data(void) const method
888 
892 
894 {
895  const Vector<size_t> selection_indices = instances.arrange_selection_indices();
896 
897  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
898 
899  return(data.arrange_submatrix(selection_indices, inputs_indices));
900 }
901 
902 
903 // Matrix<double> arrange_selection_target_data(void) const method
904 
908 
910 {
911  const Vector<size_t> selection_indices = instances.arrange_selection_indices();
912 
913  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
914 
915  return(data.arrange_submatrix(selection_indices, targets_indices));
916 }
917 
918 
919 // Matrix<double> arrange_testing_input_data(void) const method
920 
924 
926 {
927  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
928 
929  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
930 
931  return(data.arrange_submatrix(testing_indices, inputs_indices));
932 }
933 
934 
935 // Matrix<double> arrange_testing_target_data(void) const method
936 
940 
942 {
943  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
944 
945  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
946 
947  return(data.arrange_submatrix(testing_indices, targets_indices));
948 }
949 
950 
951 // Vector<double> get_instance(const size_t&) const method
952 
955 
956 Vector<double> DataSet::get_instance(const size_t& i) const
957 {
958  // Control sentence (if debug)
959 
960  #ifdef __OPENNN_DEBUG__
961 
962  const size_t instances_number = instances.get_instances_number();
963 
964  if(i >= instances_number)
965  {
966  std::ostringstream buffer;
967 
968  buffer << "OpenNN Exception: DataSet class.\n"
969  << "Vector<double> get_instance(const size_t&) const method.\n"
970  << "Index of instance must be less than number of instances.\n";
971 
972  throw std::logic_error(buffer.str());
973  }
974 
975  #endif
976 
977  // Get instance
978 
979  return(data.arrange_row(i));
980 }
981 
982 
983 // Vector<double> get_instance(const size_t&, const Vector<size_t>&) const method
984 
988 
989 Vector<double> DataSet::get_instance(const size_t& instance_index, const Vector<size_t>& variables_indices) const
990 {
991  // Control sentence (if debug)
992 
993  #ifdef __OPENNN_DEBUG__
994 
995  const size_t instances_number = instances.get_instances_number();
996 
997  if(instance_index >= instances_number)
998  {
999  std::ostringstream buffer;
1000 
1001  buffer << "OpenNN Exception: DataSet class.\n"
1002  << "Vector<double> get_instance(const size_t&, const Vector<size_t>&) const method.\n"
1003  << "Index of instance must be less than number of instances.\n";
1004 
1005  throw std::logic_error(buffer.str());
1006  }
1007 
1008  #endif
1009 
1010  // Get instance
1011 
1012  return(data.arrange_row(instance_index, variables_indices));
1013 }
1014 
1015 
1016 // Vector<double> get_variable(const size_t&) const method
1017 
1020 
1022 {
1023  // Control sentence (if debug)
1024 
1025  #ifdef __OPENNN_DEBUG__
1026 
1027  const size_t variables_number = variables.get_variables_number();
1028 
1029  if(i >= variables_number)
1030  {
1031  std::ostringstream buffer;
1032 
1033  buffer << "OpenNN Exception: DataSet class.\n"
1034  << "Vector<double> get_variable(const size_t&) const method.\n"
1035  << "Index of variable must be less than number of instances.\n";
1036 
1037  throw std::logic_error(buffer.str());
1038  }
1039 
1040  #endif
1041 
1042  // Get variable
1043 
1044  return(data.arrange_column(i));
1045 }
1046 
1047 
1048 // Vector<double> get_variable(const size_t&, const Vector<size_t>&) const method
1049 
1053 
1054 Vector<double> DataSet::get_variable(const size_t& variable_index, const Vector<size_t>& instances_indices) const
1055 {
1056  // Control sentence (if debug)
1057 
1058  #ifdef __OPENNN_DEBUG__
1059 
1060  const size_t variables_number = variables.get_variables_number();
1061 
1062  if(variable_index >= variables_number)
1063  {
1064  std::ostringstream buffer;
1065 
1066  buffer << "OpenNN Exception: DataSet class.\n"
1067  << "Vector<double> get_variable(const size_t&, const Vector<double>&) const method.\n"
1068  << "Index of variable must be less than number of instances.\n";
1069 
1070  throw std::logic_error(buffer.str());
1071  }
1072 
1073  #endif
1074 
1075  // Get variable
1076 
1077  return(data.arrange_column(variable_index, instances_indices));
1078 }
1079 
1080 
1081 // void set(void) method
1082 
1084 
1085 void DataSet::set(void)
1086 {
1087  data_file_name = "";
1088 
1089  first_cell = "";
1090  last_cell = "";
1091 
1092  data.set();
1093 
1094  variables.set();
1095  instances.set();
1096 
1097  missing_values.set();
1098 
1099  display = true;
1100 
1101  file_type = DAT;
1102 }
1103 
1104 
1105 // void set(const Matrix<double>&) method
1106 
1109 
1110 void DataSet::set(const Matrix<double>& new_data)
1111 {
1112  data_file_name = "";
1113 
1114  const size_t variables_number = new_data.get_columns_number();
1115  const size_t instances_number = new_data.get_rows_number();
1116 
1117  set(instances_number, variables_number);
1118 
1119  data = new_data;
1120 
1121  display = true;
1122 
1123  file_type = DAT;
1124 }
1125 
1126 
1127 // void set(const size_t&, const size_t&) method
1128 
1134 
1135 void DataSet::set(const size_t& new_instances_number, const size_t& new_variables_number)
1136 {
1137  // Control sentence (if debug)
1138 
1139  #ifdef __OPENNN_DEBUG__
1140 
1141  if(new_instances_number == 0)
1142  {
1143  std::ostringstream buffer;
1144 
1145  buffer << "OpenNN Exception: DataSet class.\n"
1146  << "void set(const size_t&, const size_t&) method.\n"
1147  << "Number of instances must be greater than zero.\n";
1148 
1149  throw std::logic_error(buffer.str());
1150  }
1151 
1152  if(new_variables_number == 0)
1153  {
1154  std::ostringstream buffer;
1155 
1156  buffer << "OpenNN Exception: DataSet class.\n"
1157  << "void set(const size_t&, const size_t&) method.\n"
1158  << "Number of variables must be greater than zero.\n";
1159 
1160  throw std::logic_error(buffer.str());
1161  }
1162 
1163  #endif
1164 
1165  data.set(new_instances_number, new_variables_number);
1166 
1167  instances.set(new_instances_number);
1168 
1169  variables.set(new_variables_number);
1170 
1171  missing_values.set(new_instances_number, new_variables_number);
1172 
1173  display = true;
1174 
1175  file_type = DAT;
1176 }
1177 
1178 
1179 // void set(const size_t&, const size_t&, const size_t&) method
1180 
1186 
1187 void DataSet::set(const size_t& new_instances_number, const size_t& new_inputs_number, const size_t& new_targets_number)
1188 {
1189  data_file_name = "";
1190 
1191  const size_t new_variables_number = new_inputs_number + new_targets_number;
1192 
1193  data.set(new_instances_number, new_variables_number);
1194 
1195  variables.set(new_inputs_number, new_targets_number);
1196 
1197  instances.set(new_instances_number);
1198 
1199  missing_values.set(new_instances_number, new_variables_number);
1200 
1201  display = true;
1202 
1203  file_type = DAT;
1204 }
1205 
1206 
1207 // void set(const DataSet& other_data_set)
1208 
1211 
1212 void DataSet::set(const DataSet& other_data_set)
1213 {
1214  data_file_name = other_data_set.data_file_name;
1215 
1216  header_line = other_data_set.header_line;
1217 
1218  separator = other_data_set.separator;
1219 
1220  missing_values_label = other_data_set.missing_values_label;
1221 
1222  data = other_data_set.data;
1223 
1224  variables = other_data_set.variables;
1225 
1226  instances = other_data_set.instances;
1227 
1228  missing_values = other_data_set.missing_values;
1229 
1230  display = other_data_set.display;
1231 
1232  file_type = other_data_set.file_type;
1233 }
1234 
1235 
1236 // void set(const tinyxml2::XMLDocument&) method
1237 
1240 
1241 void DataSet::set(const tinyxml2::XMLDocument& data_set_document)
1242 {
1243  set_default();
1244 
1245  from_XML(data_set_document);
1246 }
1247 
1248 
1249 // void set(const std::string&) method
1250 
1253 
1254 void DataSet::set(const std::string& file_name)
1255 {
1256  load(file_name);
1257 }
1258 
1259 
1260 // void set_display(const bool&) method
1261 
1266 
1267 void DataSet::set_display(const bool& new_display)
1268 {
1269  display = new_display;
1270 }
1271 
1272 
1273 // void set_default(void) method
1274 
1279 
1281 {
1282  header_line = false;
1283 
1284  separator = Space;
1285 
1286  missing_values_label = "?";
1287 
1288  lags_number = 0;
1289 
1290  steps_ahead = 0;
1291 
1292  association = false;
1293 
1294  angular_units = Degrees;
1295 
1296  display = true;
1297 
1298  file_type = DAT;
1299 
1300  sheet_number = 1;
1301 }
1302 
1303 // void set_MPI(const DataSet*) method
1304 
1307 
// Fills this object from another data set.
// Without __OPENNN_MPI__ this is a plain set(*data_set).
// With __OPENNN_MPI__ the training and selection instances of *data_set (read on rank 0 only)
// are split among the MPI processes, and each participating process stores its share in this object.
// data_set: source data set; in the MPI branch it is dereferenced only inside the rank == 0 sections.
1308 void DataSet::set_MPI(const DataSet* data_set)
1309 {
1310 #ifdef __OPENNN_MPI__
1311 
1312  int size;
1313  MPI_Comm_size(MPI_COMM_WORLD, &size);
1314 
1315  int rank;
1316  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
1317 
1318  int training_instances_number;
1319  int selection_instances_number;
1320 
1321  int inputs_number;
1322  int outputs_number;
1323 
1324  Vector<int> inputs_indices;
1325  Vector<int> targets_indices;
1326 
1327  Vector<size_t> training_indices;
1328  Vector<size_t> selection_indices;
1329 
// Only rank 0 reads the source data set; the other ranks get these values through the relay below.
1330  if(rank == 0)
1331  {
1332  // Variables to send initialization
1333 
1334  const Instances& instances = data_set->get_instances();
1335 
1336  training_instances_number = (int)instances.count_training_instances_number();
1337  selection_instances_number = (int)instances.count_selection_instances_number();
1338 
1339  training_indices = instances.arrange_training_indices();
1340  selection_indices = instances.arrange_selection_indices();
1341 
1342  const Variables& variables = data_set->get_variables();
1343 
1344  inputs_indices = variables.arrange_inputs_indices_int();
1345  targets_indices = variables.arrange_targets_indices_int();
1346 
1347  inputs_number = (int)variables.count_inputs_number();
1348  outputs_number = (int)variables.count_targets_number();
1349  }
1350 
1351  // Send variables
1352 
1353  MPI_Barrier(MPI_COMM_WORLD);
1354 
// Relay along a chain: every rank > 0 receives from rank-1 here, and every rank except the
// last forwards the same values to rank+1 in the next section.
// The two counts are received first because they size the index vectors that receive tags 7 and 8.
// NOTE(review): MPI_Waitall takes an array of statuses; the MPI standard names
// MPI_STATUSES_IGNORE for that argument, while this code passes MPI_STATUS_IGNORE - confirm.
1355  if(rank > 0)
1356  {
1357  MPI_Request req[4];
1358 
1359  MPI_Irecv(&inputs_number, 1, MPI_INT, rank-1, 1, MPI_COMM_WORLD, &req[0]);
1360  MPI_Irecv(&outputs_number, 1, MPI_INT, rank-1, 2, MPI_COMM_WORLD, &req[1]);
1361 
1362  MPI_Waitall(2, req, MPI_STATUS_IGNORE);
1363 
1364  inputs_indices.set(inputs_number);
1365  targets_indices.set(outputs_number);
1366 
1367  MPI_Irecv(&training_instances_number, 1, MPI_INT, rank-1, 5, MPI_COMM_WORLD, &req[0]);
1368  MPI_Irecv(&selection_instances_number, 1, MPI_INT, rank-1, 6, MPI_COMM_WORLD, &req[1]);
1369 
1370  MPI_Irecv(inputs_indices.data(), (int)inputs_number, MPI_INT, rank-1, 7, MPI_COMM_WORLD, &req[2]);
1371  MPI_Irecv(targets_indices.data(), (int)outputs_number, MPI_INT, rank-1, 8, MPI_COMM_WORLD, &req[3]);
1372 
1373  MPI_Waitall(4, req, MPI_STATUS_IGNORE);
1374  }
1375 
1376  if(rank < size-1)
1377  {
1378  MPI_Request req[6];
1379 
1380  MPI_Isend(&inputs_number, 1, MPI_INT, rank+1, 1, MPI_COMM_WORLD, &req[0]);
1381  MPI_Isend(&outputs_number, 1, MPI_INT, rank+1, 2, MPI_COMM_WORLD, &req[1]);
1382 
1383  MPI_Isend(&training_instances_number, 1, MPI_INT, rank+1, 5, MPI_COMM_WORLD, &req[2]);
1384  MPI_Isend(&selection_instances_number, 1, MPI_INT, rank+1, 6, MPI_COMM_WORLD, &req[3]);
1385 
1386  MPI_Isend(inputs_indices.data(), (int)inputs_number, MPI_INT, rank+1, 7, MPI_COMM_WORLD, &req[4]);
1387  MPI_Isend(targets_indices.data(), (int)outputs_number, MPI_INT, rank+1, 8, MPI_COMM_WORLD, &req[5]);
1388 
1389  MPI_Waitall(6, req, MPI_STATUS_IGNORE);
1390  }
1391 
// From here on, size is the number of processes that take part in the split:
// never more than the number of training instances.
// NOTE(review): with zero training instances size becomes 0, and the divisions and
// per-processor indexing below would then be invalid - confirm callers rule this out.
1392  size = std::min(size,(int)training_instances_number);
1393 
1394  // Get the group of processes in MPI_COMM_WORLD
1395  MPI_Group world_group;
1396  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
1397 
1398  int* ranks = (int*)malloc(size*sizeof(int));
1399 
1400  for(int i = 0; i < size; i++)
1401  {
1402  ranks[i] = i;
1403  }
1404 
1405  // Construct a group containing ranks 0..size-1 of world_group
1406  MPI_Group error_group;
1407  MPI_Group_incl(world_group, size, ranks, &error_group);
1408 
1409  // Create a new communicator based on the group
// NOTE(review): error_comm, error_group and world_group are neither used afterwards nor freed
// in the code visible here - confirm whether that is intended.
1410  MPI_Comm error_comm;
1411  MPI_Comm_create(MPI_COMM_WORLD, error_group, &error_comm);
1412 
1413  int i = 0;
1414 
1415  Vector<size_t> training_instances_per_processor(size);
1416  Vector<size_t> selection_instances_per_processor(size);
1417 
// Even split: every process gets count/size instances, and the first (count % size)
// processes get one extra.
1418  for(i = 0; i < size; i++)
1419  {
1420  training_instances_per_processor[i] = (size_t)(training_instances_number/size);
1421  selection_instances_per_processor[i] = (size_t)(selection_instances_number/size);
1422 
1423  if(i < training_instances_number%size)
1424  {
1425  training_instances_per_processor[i]++;
1426  }
1427  if(i < selection_instances_number%size)
1428  {
1429  selection_instances_per_processor[i]++;
1430  }
1431  }
1432 
1433  Matrix<double> processor_data;
1434 
1435  // Append training instances
1436 
// Receiving ranks: the buffer is sized (variables x instances) and each received instance is
// written as a run of (inputs_number+outputs_number) consecutive doubles, then the matrix is
// transposed. Presumably this relies on Matrix storing columns contiguously - TODO confirm.
// Message tags are the per-rank instance position: j for training instances,
// j + training count for selection instances, matching the sends on rank 0.
1437  if(rank != 0 && rank < size)
1438  {
1439  processor_data.set(inputs_number+outputs_number, training_instances_per_processor[rank]+selection_instances_per_processor[rank]);
1440 
1441  for(int j = 0; j < training_instances_per_processor[rank]; j++)
1442  {
1443  MPI_Recv(processor_data.data()+(j*(inputs_number+outputs_number)), (int)(inputs_number+outputs_number), MPI_DOUBLE, 0, j, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
1444  }
1445 
1446  for(int j = 0; j < selection_instances_per_processor[rank]; j++)
1447  {
1448  MPI_Recv(processor_data.data()+((j+training_instances_per_processor[rank])*(inputs_number+outputs_number)),
1449  (int)(inputs_number+outputs_number), MPI_DOUBLE, 0, j+(int)training_instances_per_processor[rank], MPI_COMM_WORLD, MPI_STATUS_IGNORE);
1450  }
1451 
1452  processor_data = processor_data.calculate_transpose();
1453  }
1454  else if(rank == 0)
1455  {
// Rank 0 keeps the first training/selection indices for itself, so sending starts right
// after its own share and proceeds rank by rank.
1456  size_t training_instances_sent = training_instances_per_processor[rank];
1457  size_t selection_instances_sent = selection_instances_per_processor[rank];
1458 
1459  for(i = 1; i < size; i++)
1460  {
1461  for(int j = 0; j < training_instances_per_processor[i]; j++)
1462  {
1463  const Vector<double> instance_to_send = data_set->get_instance(training_indices[training_instances_sent]);
1464 
1465  MPI_Send(instance_to_send.data(), (int)(inputs_number+outputs_number), MPI_DOUBLE, i, j, MPI_COMM_WORLD);
1466 
1467  training_instances_sent++;
1468  }
1469  for(int j = 0; j < selection_instances_per_processor[i]; j++)
1470  {
1471  const Vector<double> instance_to_send = data_set->get_instance(selection_indices[selection_instances_sent]);
1472 
1473  MPI_Send(instance_to_send.data(), (int)(inputs_number+outputs_number), MPI_DOUBLE, i, j+(int)training_instances_per_processor[i], MPI_COMM_WORLD);
1474 
1475  selection_instances_sent++;
1476  }
1477  }
1478 
// Rank 0's own share is copied row by row: training rows first, selection rows after them.
1479  processor_data.set(training_instances_per_processor[rank]+selection_instances_per_processor[rank],inputs_number+outputs_number);
1480 
1481  for(i = 0; i < training_instances_per_processor[rank]; i++)
1482  {
1483  const Vector<double> instance_to_append = data_set->get_instance(training_indices[i]);
1484 
1485  processor_data.set_row(i, instance_to_append);
1486  }
1487  for(i = 0; i < selection_instances_per_processor[rank]; i++)
1488  {
1489  const Vector<double> instance_to_append = data_set->get_instance(selection_indices[i]);
1490 
1491  processor_data.set_row(i+training_instances_per_processor[rank], instance_to_append);
1492  }
1493  }
1494 
1495  MPI_Barrier(MPI_COMM_WORLD);
1496 
// Only participating ranks store data; the local rows are ordered training-then-selection,
// and the per-rank counts (as doubles) plus 0.0 are handed to split_sequential_indices().
1497  if(rank < size)
1498  {
1499  set_data(processor_data);
1501  get_variables_pointer()->set_input_indices(inputs_indices);
1502  get_variables_pointer()->set_target_indices(targets_indices);
1503  get_instances_pointer()->split_sequential_indices((double)training_instances_per_processor[rank], (double)selection_instances_per_processor[rank], 0.0);
1504  }
1505 
1506  free(ranks);
1507 
1508 #else
1509 
1510  set(*data_set);
1511 
1512 #endif
1513 }
1514 
1515 
1516 // void set_data(const Matrix<double>&) method
1517 
1523 
1524 void DataSet::set_data(const Matrix<double>& new_data)
1525 {
1526  // Control sentence (if debug)
1527 
1528  #ifdef __OPENNN_DEBUG__
1529 
1530  const size_t rows_number = new_data.get_rows_number();
1531  const size_t instances_number = instances.get_instances_number();
1532 
1533  if(rows_number != instances_number)
1534  {
1535  std::ostringstream buffer;
1536 
1537  buffer << "OpenNN Exception: DataSet class.\n"
1538  << "void set_data(const Matrix<double>&) method.\n"
1539  << "Number of rows (" << rows_number << ") must be equal to number of instances (" << instances_number << ").\n";
1540 
1541  throw std::logic_error(buffer.str());
1542  }
1543 
1544  const size_t columns_number = new_data.get_columns_number();
1545  const size_t variables_number = variables.get_variables_number();
1546 
1547  if(columns_number != variables_number)
1548  {
1549  std::ostringstream buffer;
1550 
1551  buffer << "OpenNN Exception: DataSet class.\n"
1552  << "void set_data(const Matrix<double>&) method.\n"
1553  << "Number of columns (" << columns_number << ") must be equal to number of variables (" << variables_number << ").\n";
1554 
1555  throw std::logic_error(buffer.str());
1556  }
1557 
1558  #endif
1559 
1560  // Set data
1561 
1562  data = new_data;
1563 
1564  instances.set_instances_number(data.get_rows_number());
1565  variables.set_variables_number(data.get_columns_number());
1566 
1567 }
1568 
1569 
1570 // void set_data_file_name(const std::string&) method
1571 
1576 
1577 void DataSet::set_data_file_name(const std::string& new_data_file_name)
1578 {
1579  data_file_name = new_data_file_name;
1580 }
1581 
1582 
1583 // void set_file_type(const FileType&) method
1584 
1586 
1587 void DataSet::set_file_type(const DataSet::FileType& new_file_type)
1588 {
1589  file_type = new_file_type;
1590 }
1591 
1592 
1593 // void set_file_type(const std::string&) method
1594 
1596 
1597 void DataSet::set_file_type(const std::string& new_file_type)
1598 {
1599  if(new_file_type == "txt")
1600  {
1601  file_type = TXT;
1602  }
1603  else if(new_file_type == "dat")
1604  {
1605  file_type = DAT;
1606  }
1607  else if(new_file_type == "csv")
1608  {
1609  file_type = CSV;
1610  }
1611  else if(new_file_type == "ods")
1612  {
1613  file_type = ODS;
1614  }
1615  else if(new_file_type == "xlsx")
1616  {
1617  file_type = XLSX;
1618  }
1619  else if(new_file_type == "arff")
1620  {
1621  file_type = ARFF;
1622  }
1623  else
1624  {
1625  std::ostringstream buffer;
1626 
1627  buffer << "OpenNN Exception: DataSet class.\n"
1628  << "void set_file_type(const std::string&) method.\n"
1629  << "Unknown file type.";
1630 
1631  throw std::logic_error(buffer.str());
1632  }
1633 }
1634 
1635 
1636 // void set_header_line(const bool&) method
1637 
1639 
1640 void DataSet::set_header_line(const bool& new_header_line)
1641 {
1642  header_line = new_header_line;
1643 }
1644 
1645 
1646 // void set_rows_label(const bool&) method
1647 
1649 
1650 void DataSet::set_rows_label(const bool& new_rows_label)
1651 {
1652  rows_label = new_rows_label;
1653 }
1654 
1655 
1656 // void set_separator(const Separator&) method
1657 
1660 
1661 void DataSet::set_separator(const Separator& new_separator)
1662 {
1663  separator = new_separator;
1664 }
1665 
1666 
1667 // void set_separator(const std::string&) method
1668 
1671 
1672 void DataSet::set_separator(const std::string& new_separator)
1673 {
1674  if(new_separator == "Space")
1675  {
1676  separator = Space;
1677  }
1678  else if(new_separator == "Tab")
1679  {
1680  separator = Tab;
1681  }
1682  else if(new_separator == "Comma")
1683  {
1684  separator = Comma;
1685  }
1686  else if(new_separator == "Semicolon")
1687  {
1688  separator = Semicolon;
1689  }
1690  else
1691  {
1692  std::ostringstream buffer;
1693 
1694  buffer << "OpenNN Exception: DataSet class.\n"
1695  << "void set_separator(const std::string&) method.\n"
1696  << "Unknown separator: " << new_separator << ".\n";
1697 
1698  throw std::logic_error(buffer.str());
1699  }
1700 }
1701 
1702 
1703 // void set_missing_values_label(const std::string&) method
1704 
1707 
1708 void DataSet::set_missing_values_label(const std::string& new_missing_values_label)
1709 {
1710  // Control sentence (if debug)
1711 
1712  #ifdef __OPENNN_DEBUG__
1713 
1714  if(get_trimmed(new_missing_values_label).empty())
1715  {
1716  std::ostringstream buffer;
1717 
1718  buffer << "OpenNN Exception: DataSet class.\n"
1719  << "void set_missing_values_label(const std::string&) method.\n"
1720  << "Missing values label cannot be empty.\n";
1721 
1722  throw std::logic_error(buffer.str());
1723  }
1724 
1725  #endif
1726 
1727 
1728  missing_values_label = new_missing_values_label;
1729 }
1730 
1731 
1732 // void set_lags_number(const size_t&)
1733 
1737 
1738 void DataSet::set_lags_number(const size_t& new_lags_number)
1739 {
1740  lags_number = new_lags_number;
1741 }
1742 
1743 
1744 // void set_steps_ahead_number(const size_t&)
1745 
1749 
1750 void DataSet::set_steps_ahead_number(const size_t& new_steps_ahead_number)
1751 {
1752  steps_ahead = new_steps_ahead_number;
1753 }
1754 
1755 
1756 // void set_autoassociation(const bool&)
1757 
1763 
1764 void DataSet::set_autoassociation(const bool& new_autoassociation)
1765 {
1766  association = new_autoassociation;
1767 }
1768 
1769 
1770 // void set_learning_task(const ProjectType&)
1771 
1774 
1775 void DataSet::set_learning_task(const DataSet::ProjectType& new_learning_task)
1776 {
1777  learning_task = new_learning_task;
1778 }
1779 
1780 
1781 // void set_learning_task(const std::string&)
1782 
1785 
1786 void DataSet::set_learning_task(const std::string& new_learning_task)
1787 {
1788  if(new_learning_task == "Approximation")
1789  {
1790  learning_task = Approximation;
1791  }
1792  else if(new_learning_task == "Classification")
1793  {
1794  learning_task = Classification;
1795  }
1796  else if(new_learning_task == "Forecasting")
1797  {
1798  learning_task = Forecasting;
1799  }
1800  else if(new_learning_task == "Association")
1801  {
1802  learning_task = Association;
1803  }
1804  else
1805  {
1806  std::ostringstream buffer;
1807 
1808  buffer << "OpenNN Exception: DataSet class.\n"
1809  << "void set_learning_task(const std::string&) method.\n"
1810  << "Not known project type.\n";
1811 
1812  throw std::logic_error(buffer.str());
1813  }
1814 }
1815 
1816 
1817 // void set_angular_variables(const Vector<size_t>&)
1818 
1821 
1822 void DataSet::set_angular_variables(const Vector<size_t>& new_angular_variables)
1823 {
1824  angular_variables = new_angular_variables;
1825 }
1826 
1827 
1828 // void set_angular_units(AngularUnits&)
1829 
1831 
// Body of the angular-units setter named in the banner comment that precedes
// this block: copies new_angular_units into the angular_units member.
// NOTE(review): the signature line is not visible in this listing; the
// parameter name is taken from its use below.
1833 {
1834  angular_units = new_angular_units;
1835 }
1836 
1837 
1838 
1839 // void set_instances_number(const size_t&) method
1840 
1845 
1846 void DataSet::set_instances_number(const size_t& new_instances_number)
1847 {
1848  const size_t variables_number = variables.get_variables_number();
1849 
1850  data.set(new_instances_number, variables_number);
1851 
1852  instances.set(new_instances_number);
1853 }
1854 
1855 
1856 // void set_variables_number(const size_t&) method
1857 
1862 
1863 void DataSet::set_variables_number(const size_t& new_variables_number)
1864 {
1865  const size_t instances_number = instances.get_instances_number();
1866 
1867  data.set(instances_number, new_variables_number);
1868 
1869  variables.set(new_variables_number);
1870 }
1871 
1872 
1873 // void set_instance(const size_t&, const Vector<double>&)
1874 
1878 
1879 void DataSet::set_instance(const size_t& instance_index, const Vector<double>& instance)
1880 {
1881  // Control sentence (if debug)
1882 
1883  #ifdef __OPENNN_DEBUG__
1884 
1885  const size_t instances_number = instances.get_instances_number();
1886 
1887  if(instance_index >= instances_number)
1888  {
1889  std::ostringstream buffer;
1890 
1891  buffer << "OpenNN Exception: DataSet class.\n"
1892  << "void set_instance(const size_t&, const Vector<double>&) method.\n"
1893  << "Index of instance must be less than number of instances.\n";
1894 
1895  throw std::logic_error(buffer.str());
1896  }
1897 
1898  const size_t size = instance.size();
1899  const size_t variables_number = variables.get_variables_number();
1900 
1901  if(size != variables_number)
1902  {
1903  std::ostringstream buffer;
1904 
1905  buffer << "OpenNN Exception: DataSet class.\n"
1906  << "void set_instance(const size_t&, const Vector<double>&) method.\n"
1907  << "Size (" << size << ") must be equal to number of variables (" << variables_number << ").\n";
1908 
1909  throw std::logic_error(buffer.str());
1910  }
1911 
1912  #endif
1913 
1914  // Set instance
1915 
1916  data.set_row(instance_index, instance);
1917 }
1918 
1919 
1920 // void add_instance(const Vector<double>&) method
1921 
1927 
// Body of add_instance (name per the exception text below): appends `instance`
// as a new row of the data matrix and sets the instances object to one more
// instance than before.
// With __OPENNN_DEBUG__ defined, throws std::logic_error when instance.size()
// differs from the number of variables.
// NOTE(review): the signature line is not visible in this listing.
1929 {
1930  // Control sentence (if debug)
1931 
1932  #ifdef __OPENNN_DEBUG__
1933 
1934  const size_t size = instance.size();
1935  const size_t variables_number = variables.get_variables_number();
1936 
1937  if(size != variables_number)
1938  {
1939  std::ostringstream buffer;
1940 
1941  buffer << "OpenNN Exception: DataSet class.\n"
1942  << "void add_instance(const Vector<double>&) method.\n"
1943  << "Size of instance must be equal to number of variables.\n";
1944 
1945  throw std::logic_error(buffer.str());
1946  }
1947 
1948  #endif
1949 
      // Read the count before the row is appended.
1950  const size_t instances_number = instances.get_instances_number();
1951 
1952  data.append_row(instance);
1953 
      // NOTE(review): this calls instances.set(...), whereas subtract_instance
      // calls instances.set_instances_number(...); whether the two differ in
      // what they keep of the existing instance settings is not visible here.
1954  instances.set(instances_number+1);
1955 }
1956 
1957 
1958 // void subtract_instance(size_t) method
1959 
1964 
1965 void DataSet::subtract_instance(const size_t& instance_index)
1966 {
1967  const size_t instances_number = instances.get_instances_number();
1968 
1969  // Control sentence (if debug)
1970 
1971  #ifdef __OPENNN_DEBUG__
1972 
1973  if(instance_index >= instances_number)
1974  {
1975  std::ostringstream buffer;
1976 
1977  buffer << "OpenNN Exception: DataSet class.\n"
1978  << "void subtract_instance(size_t) method.\n"
1979  << "Index of instance must be less than number of instances.\n";
1980 
1981  throw std::logic_error(buffer.str());
1982  }
1983 
1984  #endif
1985 
1986  data.subtract_row(instance_index);
1987 
1988  instances.set_instances_number(instances_number-1);
1989 
1990 }
1991 
1992 
1993 // void append_variable(const Vector<double>&) method
1994 
1997 
// Body of append_variable (name per the exception text below): appends
// `variable` as a new column of the data matrix and grows the variables count
// by one.
// With __OPENNN_DEBUG__ defined, throws std::logic_error when variable.size()
// differs from the number of instances.
// NOTE(review): the signature line is not visible in this listing.
1999 {
2000  // Control sentence (if debug)
2001 
2002  #ifdef __OPENNN_DEBUG__
2003 
2004  const size_t size = variable.size();
2005  const size_t instances_number = instances.get_instances_number();
2006 
2007  if(size != instances_number)
2008  {
2009  std::ostringstream buffer;
2010 
2011  buffer << "OpenNN Exception: DataSet class.\n"
2012  << "void append_variable(const Vector<double>&) method.\n"
2013  << "Size of variable must be equal to number of instances.\n";
2014 
2015  throw std::logic_error(buffer.str());
2016  }
2017 
2018  #endif
2019 
2020  const size_t variables_number = variables.get_variables_number();
2021 
2022  data.append_column(variable);
2023 
      // Keep a copy of the widened matrix: set_variables_number() below calls
      // data.set(...) on the member, and set_data() then stores this copy.
2024  Matrix<double> new_data(data);
2025 
2026  const size_t new_variables_number = variables_number + 1;
2027 
2028  set_variables_number(new_variables_number);
2029 
2030  set_data(new_data);
2031 }
2032 
2033 
2034 // void subtract_variable(size_t) method
2035 
2038 
2039 void DataSet::subtract_variable(const size_t& variable_index)
2040 {
2041  const size_t variables_number = variables.get_variables_number();
2042 
2043  // Control sentence (if debug)
2044 
2045  #ifdef __OPENNN_DEBUG__
2046 
2047  if(variable_index >= variables_number)
2048  {
2049  std::ostringstream buffer;
2050 
2051  buffer << "OpenNN Exception: DataSet class.\n"
2052  << "void subtract_variable(size_t) method.\n"
2053  << "Index of variable must be less than number of variables.\n";
2054 
2055  throw std::logic_error(buffer.str());
2056  }
2057 
2058  #endif
2059 
2060  data.subtract_column(variable_index);
2061 
2062  Matrix<double> new_data(data);
2063 
2064  const size_t new_variables_number = variables_number - 1;
2065 
2066  set_variables_number(new_variables_number);
2067 
2068  set_data(new_data);
2069 }
2070 
2071 
2072 // Vector<size_t> unuse_constant_variables(void) method
2073 
2076 
// Body of unuse_constant_variables (name per the exception text below): marks
// as Variables::Unused every variable whose use is Variables::Input and whose
// standard deviation is below 1.0e-6, and returns the indices it changed.
// With __OPENNN_DEBUG__ defined, throws std::logic_error when there are no
// variables.
// NOTE(review): the signature line is not visible in this listing.
2078 {
2079  const size_t variables_number = variables.get_variables_number();
2080 
2081  // Control sentence (if debug)
2082 
2083  #ifdef __OPENNN_DEBUG__
2084 
2085  if(variables_number == 0)
2086  {
2087  std::ostringstream buffer;
2088 
2089  buffer << "OpenNN Exception: DataSet class.\n"
2090  << "Vector<size_t> unuse_constant_variables(void) method.\n"
2091  << "Number of variables is zero.\n";
2092 
2093  throw std::logic_error(buffer.str());
2094  }
2095 
2096  #endif
2097 
      // Indexed by variable in the loop below (statistics[i]).
2098  const Vector< Statistics<double> > statistics = data.calculate_statistics();
2099 
2100  Vector<size_t> constant_variables;
2101 
2102  for(size_t i = 0; i < variables_number; i++)
2103  {
      // Only input variables are considered; other uses are left untouched.
2104  if(variables.get_use(i) == Variables::Input && statistics[i].standard_deviation < 1.0e-6)
2105  {
2106  variables.set_use(i, Variables::Unused);
2107  constant_variables.push_back(i);
2108  }
2109  }
2110 
2111  return(constant_variables);
2112 }
2113 
2114 
2115 // Vector<size_t> unuse_repeated_instances(void) method
2116 
2119 
// Body of unuse_repeated_instances (name per the banner comment preceding this
// block): for every pair i < j, if instance j is not already
// Instances::Unused and its row equals row i, instance j is set to
// Instances::Unused and j is added to the returned vector.
// With __OPENNN_DEBUG__ defined, throws std::logic_error when there are no
// instances.
// NOTE(review): the signature line is not visible in this listing.
// NOTE(review): the exception text names "unuse_repeated_indices", which does
// not match the banner comment's "unuse_repeated_instances".
2121 {
2122  const size_t instances_number = instances.get_instances_number();
2123 
2124  // Control sentence (if debug)
2125 
2126  #ifdef __OPENNN_DEBUG__
2127 
2128  if(instances_number == 0)
2129  {
2130  std::ostringstream buffer;
2131 
2132  buffer << "OpenNN Exception: DataSet class.\n"
2133  << "Vector<size_t> unuse_repeated_indices(void) method.\n"
2134  << "Number of instances is zero.\n";
2135 
2136  throw std::logic_error(buffer.str());
2137  }
2138 
2139  #endif
2140 
2141  Vector<size_t> repeated_instances;
2142 
2143  Vector<double> instance_i;
2144  Vector<double> instance_j;
2145 
2146  int i = 0;
2147 
      // NOTE(review): repeated_instances and instances are not in the private
      // clause, yet push_back() and set_use() are called on them inside the
      // parallel loop with no critical section visible here. That looks like
      // a data race when OpenMP is enabled; confirm.
2148  #pragma omp parallel for private(i, instance_i, instance_j) schedule(dynamic)
2149 
2150  for(i = 0; i < (int)instances_number; i++)
2151  {
2152  instance_i = get_instance(i);
2153 
      // Only later rows are compared, so the first occurrence keeps its use.
2154  for(size_t j = i+1; j < instances_number; j++)
2155  {
2156  instance_j = get_instance(j);
2157 
2158  if(instances.get_use(j) != Instances::Unused
2159  && instance_j == instance_i)
2160  {
2161  instances.set_use(j, Instances::Unused);
2162  repeated_instances.push_back(j);
2163  }
2164  }
2165  }
2166 
2167  return(repeated_instances);
2168 }
2169 
2170 
2171 // Vector<size_t> unuse_non_significant_inputs(void)
2172 
2174 
// Body of unuse_non_significant_inputs (name per the banner comment preceding
// this block): for each input variable accepted by is_binary_variable(), counts
// the rows where both that input and the first target equal 1.0; inputs with a
// count of zero are set to Variables::Unused and their indices are returned.
// NOTE(review): the signature line is not visible in this listing.
2176 {
2177  const Vector<size_t> inputs_indices = get_variables_pointer()->arrange_inputs_indices();
2178  const size_t inputs_number = inputs_indices.size();
2179 
      // Only the first target index is used.
2180  const size_t target_index = get_variables_pointer()->arrange_targets_indices()[0];
2181 
2182  const size_t instances_number = get_instances_pointer()->count_used_instances_number();
2183 
2184  Vector<size_t> non_significant_variables;
2185 
      // NOTE(review): the condition guarding this early return is on a listing
      // line that is not visible here.
2187  {
2188  return non_significant_variables;
2189  }
2190 
2191  size_t positives = 0;
2192 
2193  size_t current_input_index;
2194 
2195  for(size_t i = 0; i < inputs_number; i++)
2196  {
2197  positives = 0;
2198 
2199  current_input_index = inputs_indices[i];
2200 
2201  if(!is_binary_variable(current_input_index))
2202  {
2203  continue;
2204  }
2205 
      // NOTE(review): instances_number is the count of used instances, but j is
      // used directly as a row index of data (rows 0..count-1), not mapped
      // through the used-instance indices; confirm this is intended.
2206  for(size_t j = 0; j < instances_number; j++)
2207  {
2208  if(data(j, current_input_index) == 1.0 && data(j, target_index) == 1.0)
2209  {
2210  positives++;
2211  }
2212  }
2213 
2214  if(positives == 0)
2215  {
2216  variables.set_use(current_input_index, Variables::Unused);
2217  non_significant_variables.push_back(current_input_index);
2218  }
2219  }
2220 
2221  return non_significant_variables;
2222 }
2223 
2224 
2225 
2226 // Vector<Histogram> calculate_data_histograms(const size_t&) const method
2227 
2235 
// Body of calculate_data_histograms (name per the banner comment preceding this
// block): builds one histogram per used variable from the rows of the used
// instances. Columns for which is_binary() holds use
// calculate_histogram_binary(); the rest use calculate_histogram(bins_number).
// NOTE(review): the signature line is not visible in this listing; bins_number
// is presumably its parameter.
2237 {
2238  const size_t used_variables_number = variables.count_used_variables_number();
2239  const Vector<size_t> used_variables_indices = variables.arrange_used_indices();
2240  const size_t used_instances_number = instances.count_used_instances_number();
2241  const Vector<size_t> used_instances_indices = instances.arrange_used_indices();
2242 
2244 
2245  Vector< Histogram<double> > histograms(used_variables_number);
2246 
2247  Vector<double> column(used_instances_number);
2248 
2249  int i = 0;
2250 
      // Each iteration writes only histograms[i]; column is private per thread.
2251  #pragma omp parallel for private(i, column) shared(histograms)
2252 
2253  for(i = 0; i < (int)used_variables_number; i++)
2254  {
2255  column = data.arrange_column(used_variables_indices[i], used_instances_indices);
2256 
2257  if (column.is_binary())
2258  {
2259  histograms[i] = column.calculate_histogram_binary();
2260  }
2261  else
2262  {
2263  histograms[i] = column.calculate_histogram(bins_number);
2264 // histograms[i] = column.calculate_histogram_missing_values(missing_indices[i], bins_number);
2265  }
2266  }
2267 
2268  return(histograms);
2269 }
2270 
2271 
2272 // Vector<Histogram> calculate_targets_histograms(const size_t&) const method
2273 
2281 
// Body of calculate_targets_histograms (name per the banner comment preceding
// this block): builds one histogram per target variable from the rows of the
// used instances, via calculate_histogram_missing_values().
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing; bins_number is presumably the parameter.
2283 {
2284  const size_t targets_number = variables.count_targets_number();
2285 
2286  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
2287 
2288  const size_t used_instances_number = instances.count_used_instances_number();
2289  const Vector<size_t> used_instances_indices = instances.arrange_used_indices();
2290 
2292 
2293  Vector< Histogram<double> > histograms(targets_number);
2294 
2295  Vector<double> column(used_instances_number);
2296 
2297  for(size_t i = 0; i < targets_number; i++)
2298  {
2299  column = data.arrange_column(targets_indices[i], used_instances_indices);
2300 
      // NOTE(review): missing_indices is indexed by the loop position i, while
      // the column comes from targets_indices[i]; confirm which index space
      // missing_indices uses.
2301  histograms[i] = column.calculate_histogram_missing_values(missing_indices[i], bins_number);
2302  }
2303 
2304  return(histograms);
2305 }
2306 
2307 
2308 // Vector<double> calculate_box_and_whiskers(void) const
2309 
2320 
// Body of the box-plot routine (the banner comment preceding this block calls
// it calculate_box_and_whiskers): returns one Vector<double> per used variable,
// computed by calculate_box_plots_missing_values() on the rows of the used
// instances.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2322 {
2323  const size_t variables_number = variables.count_used_variables_number();
2324  const Vector<size_t> variables_indices = variables.arrange_used_indices();
2325 
2326  const size_t instances_number = instances.count_used_instances_number();
2327  const Vector<size_t> instances_indices = instances.arrange_used_indices();
2328 
2330 
2331  Vector< Vector<double> > box_plots;
2332  box_plots.set(variables_number);
2333 
2334  Vector<double> column(instances_number);
2335 
      // Each iteration writes only box_plots[i]; column is private per thread.
2336 #pragma omp parallel for private(column)
2337  for(int i = 0; i < (int)variables_number; i++)
2338  {
2339  column = data.arrange_column(variables_indices[i], instances_indices);
2340 
      // NOTE(review): missing_indices is indexed by the loop position i, while
      // the column comes from variables_indices[i]; confirm which index space
      // missing_indices uses.
2341  box_plots[i] = column.calculate_box_plots_missing_values(missing_indices[i]);
2342  }
2343 
2344  return(box_plots);
2345 }
2346 
2347 
2348 // size_t calculate_training_negatives(const size_t&) const method
2349 
2352 
2353 size_t DataSet::calculate_training_negatives(const size_t& target_index) const
2354 {
2355  size_t negatives = 0;
2356 
2357  const size_t training_instances_number = instances.count_training_instances_number();
2358 
2359  Vector<size_t> training_indices = instances.arrange_training_indices();
2360 
2361  size_t training_index;
2362 
2363  for(size_t i = 0; i < training_instances_number; i++)
2364  {
2365  training_index = training_indices[i];
2366 
2367  if(data(training_index, target_index) == 0.0)
2368  {
2369  negatives++;
2370  }
2371  else if(data(training_index, target_index) != 1.0)
2372  {
2373  std::ostringstream buffer;
2374 
2375  buffer << "OpenNN Exception: DataSet class.\n"
2376  << "size_t calculate_training_negatives(const size_t&) const method.\n"
2377  << "Training instance is neither a positive nor a negative: " << data(training_index, target_index) << std::endl;
2378 
2379  throw std::logic_error(buffer.str());
2380  }
2381  }
2382 
2383  return(negatives);
2384 }
2385 
2386 
2387 // size_t calculate_selection_negatives(const size_t&) const method
2388 
2391 
2392 size_t DataSet::calculate_selection_negatives(const size_t& target_index) const
2393 {
2394  size_t negatives = 0;
2395 
2396  const size_t selection_instances_number = instances.count_selection_instances_number();
2397 
2398  Vector<size_t> selection_indices = instances.arrange_selection_indices();
2399 
2400  size_t selection_index;
2401 
2402  for(size_t i = 0; i < selection_instances_number; i++)
2403  {
2404  selection_index = selection_indices[i];
2405 
2406  if(data(selection_index, target_index) == 0.0)
2407  {
2408  negatives++;
2409  }
2410  else if(data(selection_index, target_index) != 1.0)
2411  {
2412  std::ostringstream buffer;
2413 
2414  buffer << "OpenNN Exception: DataSet class.\n"
2415  << "size_t calculate_selection_negatives(const size_t&) const method.\n"
2416  << "Selection instance is neither a positive nor a negative: " << data(selection_index, target_index) << std::endl;
2417 
2418  throw std::logic_error(buffer.str());
2419  }
2420  }
2421 
2422  return(negatives);
2423 }
2424 
2425 
2426 // size_t calculate_testing_negatives(const size_t&) const method
2427 
2430 
2431 size_t DataSet::calculate_testing_negatives(const size_t& target_index) const
2432 {
2433  size_t negatives = 0;
2434 
2435  const size_t testing_instances_number = instances.count_testing_instances_number();
2436 
2437  Vector<size_t> testing_indices = instances.arrange_testing_indices();
2438 
2439  size_t testing_index;
2440 
2441  for(size_t i = 0; i < testing_instances_number; i++)
2442  {
2443  testing_index = testing_indices[i];
2444 
2445  if(data(testing_index, target_index) == 0.0)
2446  {
2447  negatives++;
2448  }
2449  else if(data(testing_index, target_index) != 1.0)
2450  {
2451  std::ostringstream buffer;
2452 
2453  buffer << "OpenNN Exception: DataSet class.\n"
2454  << "size_t calculate_selection_negatives(const size_t&) const method.\n"
2455  << "Testing instance is neither a positive nor a negative: " << data(testing_index, target_index) << std::endl;
2456 
2457  throw std::logic_error(buffer.str());
2458  }
2459  }
2460 
2461  return(negatives);
2462 }
2463 
2464 
2465 // Vector< Vector<double> > calculate_data_statistics(void) const method
2466 
2475 
// Body of calculate_data_statistics (name per the banner comment preceding this
// block): returns data.calculate_statistics_missing_values(missing_indices).
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2477 {
2479 
2480  return(data.calculate_statistics_missing_values(missing_indices));
2481 }
2482 
2483 
2484 // Vector < Vector<double> > calculate_data_shape_parameters(void) const method
2485 
2492 
// Body of calculate_data_shape_parameters (name per the banner comment
// preceding this block): returns
// data.calculate_shape_parameters_missing_values(missing_indices).
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2494 {
2496 
2497  return(data.calculate_shape_parameters_missing_values(missing_indices));
2498 }
2499 
2500 
2501 // Matrix<double> calculate_data_statistics_matrix(void) const method
2502 
2506 
// Body of calculate_data_statistics_matrix (name per the banner comment
// preceding this block): returns a matrix with one row per used variable and
// four columns; each row is Statistics<double>::to_vector() of that variable's
// column restricted to the used instances.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2508 {
2510 
2511  const Vector<size_t> used_variables_indices = variables.arrange_used_indices();
2512  const Vector<size_t> used_instances_indices = instances.arrange_used_indices();
2513 
2514  const size_t variables_number = used_variables_indices.size();//variables.count_used_variables_number();
2515 
2516  Matrix<double> data_statistics_matrix(variables_number, 4);
2517 
2518  for(size_t i = 0; i < variables_number; i++)
2519  {
2520  const size_t variable_index = used_variables_indices[i];
2521 
2522  const Vector<double> variable_data = data.arrange_column(variable_index, used_instances_indices);
2523 
      // missing_indices is indexed by the variable's column index here.
2524  const Statistics<double> data_statistics = variable_data.calculate_statistics_missing_values(missing_indices[variable_index]);
2525 
2526  data_statistics_matrix.set_row(i, data_statistics.to_vector());
2527  }
2528 
2529  return(data_statistics_matrix);
2530 }
2531 
2532 // Matrix<double> calculate_positives_data_statistics_matrix(void) const method
2533 
2535 
// Body of calculate_positives_data_statistics_matrix (name per the exception
// text below): returns a matrix with one row per input variable and four
// columns; each row is Statistics<double>::to_vector() of that input's column
// restricted to the used instances whose first-target value equals 1.0.
// With __OPENNN_DEBUG__ defined, throws std::logic_error unless there is
// exactly one target and the target column is binary.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2537 {
2538 #ifdef __OPENNN_DEBUG__
2539 
2540  const size_t targets_number = variables.count_targets_number();
2541 
2542  if(targets_number != 1)
2543  {
2544  std::ostringstream buffer;
2545 
2546  buffer << "OpenNN Exception: DataSet class.\n"
2547  << "Matrix<double> calculate_positives_data_statistics_matrix(void) const method.\n"
2548  << "Number of targets muste be 1.\n";
2549 
2550  throw std::logic_error(buffer.str());
2551  }
2552 #endif
2553 
2554  const size_t target_index = variables.arrange_targets_indices()[0];
2555 
2556  const Vector<size_t> used_instances_indices = instances.arrange_used_indices();
2557 
2558  const Vector<double> targets = data.arrange_column(target_index, used_instances_indices);
2559 
2560 #ifdef __OPENNN_DEBUG__
2561 
2562  if(!targets.is_binary())
2563  {
2564  std::ostringstream buffer;
2565 
2566  buffer << "OpenNN Exception: DataSet class.\n"
2567  << "Matrix<double> calculate_positives_data_statistics_matrix(void) const method.\n"
2568  << "Targets vector must be binary.\n";
2569 
2570  throw std::logic_error(buffer.str());
2571  }
2572 #endif
2573 
2574  const Vector<size_t> inputs_variables_indices = variables.arrange_inputs_indices();
2575 
2577 
2578  const size_t inputs_number = inputs_variables_indices.size();
2579 
      // Positions in `targets` equal to 1.0 are mapped back to instance indices.
2580  const Vector<size_t> positives_used_instances_indices = used_instances_indices.arrange_subvector(targets.calculate_equal_than_indices(1.0));
2581 
2582  Matrix<double> data_statistics_matrix(inputs_number, 4);
2583 
2584  for(size_t i = 0; i < inputs_number; i++)
2585  {
2586  const size_t variable_index = inputs_variables_indices[i];
2587 
2588  const Vector<double> variable_data = data.arrange_column(variable_index, positives_used_instances_indices);
2589 
2590  const Statistics<double> data_statistics = variable_data.calculate_statistics_missing_values(missing_indices[variable_index]);
2591 
2592  data_statistics_matrix.set_row(i, data_statistics.to_vector());
2593  }
2594  return data_statistics_matrix;
2595 }
2596 
2597 // Matrix<double> calculate_negatives_data_statistics_matrix(void) const method
2598 
2600 
// Body of calculate_negatives_data_statistics_matrix (name per the banner
// comment preceding this block): returns a matrix with one row per input
// variable and four columns; each row is Statistics<double>::to_vector() of
// that input's column restricted to the used instances whose first-target
// value equals 0.0.
// With __OPENNN_DEBUG__ defined, throws std::logic_error unless there is
// exactly one target and the target column is binary.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
// NOTE(review): both exception texts below name
// calculate_positives_data_statistics_matrix; this looks like a copy-paste
// leftover, since this body selects the 0.0 rows.
2602 {
2603 #ifdef __OPENNN_DEBUG__
2604 
2605  const size_t targets_number = variables.count_targets_number();
2606 
2607  if(targets_number != 1)
2608  {
2609  std::ostringstream buffer;
2610 
2611  buffer << "OpenNN Exception: DataSet class.\n"
2612  << "Matrix<double> calculate_positives_data_statistics_matrix(void) const method.\n"
2613  << "Number of targets muste be 1.\n";
2614 
2615  throw std::logic_error(buffer.str());
2616  }
2617 #endif
2618 
2619  const size_t target_index = variables.arrange_targets_indices()[0];
2620 
2621  const Vector<size_t> used_instances_indices = instances.arrange_used_indices();
2622 
2624 
2625  const Vector<double> targets = data.arrange_column(target_index, used_instances_indices);
2626 
2627 #ifdef __OPENNN_DEBUG__
2628 
2629  if(!targets.is_binary())
2630  {
2631  std::ostringstream buffer;
2632 
2633  buffer << "OpenNN Exception: DataSet class.\n"
2634  << "Matrix<double> calculate_positives_data_statistics_matrix(void) const method.\n"
2635  << "Targets vector must be binary.\n";
2636 
2637  throw std::logic_error(buffer.str());
2638  }
2639 #endif
2640 
2641  const Vector<size_t> inputs_variables_indices = variables.arrange_inputs_indices();
2642 
2643  const size_t inputs_number = inputs_variables_indices.size();
2644 
      // Positions in `targets` equal to 0.0 are mapped back to instance indices.
2645  const Vector<size_t> negatives_used_instances_indices = used_instances_indices.arrange_subvector(targets.calculate_equal_than_indices(0.0));
2646 
2647  Matrix<double> data_statistics_matrix(inputs_number, 4);
2648 
2649  for(size_t i = 0; i < inputs_number; i++)
2650  {
2651  const size_t variable_index = inputs_variables_indices[i];
2652 
2653  const Vector<double> variable_data = data.arrange_column(variable_index, negatives_used_instances_indices);
2654 
2655  const Statistics<double> data_statistics = variable_data.calculate_statistics_missing_values(missing_indices[variable_index]);
2656 
2657  data_statistics_matrix.set_row(i, data_statistics.to_vector());
2658  }
2659  return data_statistics_matrix;
2660 }
2661 
2662 // Matrix<double> calculate_data_shape_parameters_matrix(void) const method
2663 
2667 
// Body of calculate_data_shape_parameters_matrix (name per the banner comment
// preceding this block): returns a matrix with one row per used variable and
// two columns; each row is calculate_shape_parameters_missing_values() of that
// variable's column restricted to the used instances.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2669 {
2671 
2672  const Vector<size_t> used_variables_indices = variables.arrange_used_indices();
2673  const Vector<size_t> used_instances_indices = instances.arrange_used_indices();
2674 
      // One entry per used variable, in the order of used_variables_indices.
2675  const Vector< Vector<size_t> > used_missing_indices = missing_indices.arrange_subvector(used_variables_indices);
2676 
2677  const size_t variables_number = variables.count_used_variables_number();
2678 
2679  Matrix<double> data_shape_parameters_matrix(variables_number, 2);
2680 
2681  for(size_t i = 0; i < variables_number; i++)
2682  {
2683  const size_t variable_index = used_variables_indices[i];
2684 
2685  const Vector<double> variable_data = data.arrange_column(variable_index, used_instances_indices);
2686 
      // NOTE(review): used_missing_indices was built as a subvector over the
      // used variables, so position i looks like the matching index; indexing
      // it with variable_index may pick the wrong entry or run past the end
      // when some variables are unused. Confirm.
2687  const Vector<double> shape_parameters = variable_data.calculate_shape_parameters_missing_values(used_missing_indices[variable_index]);
2688 
2689  data_shape_parameters_matrix.set_row(i, shape_parameters);
2690  }
2691 
2692  return(data_shape_parameters_matrix);
2693 }
2694 
2695 
2696 // Vector< Vector<double> > calculate_training_instances_statistics(void) const method
2697 
2706 
// Body of calculate_training_instances_statistics (name per the banner comment
// preceding this block): returns
// data.calculate_rows_statistics_missing_values() over the training instances.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2708 {
2709  const Vector<size_t> training_indices = instances.arrange_training_indices();
2710 
2712 
2713  return(data.calculate_rows_statistics_missing_values(training_indices, missing_indices));
2714 }
2715 
2716 
2717 // Vector< Vector<double> > calculate_selection_instances_statistics(void) const method
2718 
2727 
// Body of calculate_selection_instances_statistics (name per the banner comment
// preceding this block): returns
// data.calculate_rows_statistics_missing_values() over the selection instances.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2729 {
2730  const Vector<size_t> selection_indices = instances.arrange_selection_indices();
2731 
2733 
2734  return(data.calculate_rows_statistics_missing_values(selection_indices, missing_indices));
2735 }
2736 
2737 
2738 // Vector< Vector<double> > calculate_testing_instances_statistics(void) const method
2739 
2748 
// Body of calculate_testing_instances_statistics (name per the banner comment
// preceding this block): returns
// data.calculate_rows_statistics_missing_values() over the testing instances.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2750 {
2751  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
2752 
2754 
2755  return(data.calculate_rows_statistics_missing_values(testing_indices, missing_indices));
2756 }
2757 
2758 
2759 // Vector< Vector<double> > calculate_training_instances_shape_parameters(void) const method
2760 
2767 
// Body of calculate_training_instances_shape_parameters (name per the banner
// comment preceding this block): returns
// data.calculate_rows_shape_parameters_missing_values() over the training
// instances.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2769 {
2770  const Vector<size_t> training_indices = instances.arrange_training_indices();
2771 
2773 
2774  return(data.calculate_rows_shape_parameters_missing_values(training_indices, missing_indices));
2775 }
2776 
2777 
2778 // Vector< Vector<double> > calculate_selection_instances_shape_parameters(void) const method
2779 
2786 
// Body of calculate_selection_instances_shape_parameters (name per the banner
// comment preceding this block): returns
// data.calculate_rows_shape_parameters_missing_values() over the selection
// instances.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2788 {
2789  const Vector<size_t> selection_indices = instances.arrange_selection_indices();
2790 
2792 
2793  return(data.calculate_rows_shape_parameters_missing_values(selection_indices, missing_indices));
2794 }
2795 
2796 
2797 // Vector< Vector<double> > calculate_testing_instances_shape_parameters(void) const method
2798 
2805 
// Body of calculate_testing_instances_shape_parameters (name per the banner
// comment preceding this block): returns
// data.calculate_rows_shape_parameters_missing_values() over the testing
// instances.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2807 {
2808  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
2809 
2811 
2812  return(data.calculate_rows_shape_parameters_missing_values(testing_indices, missing_indices));
2813 }
2814 
2815 
2816 
2817 // Vector< Statistics<double> > calculate_inputs_statistics(void) const method
2818 
2827 
// Body of calculate_inputs_statistics (name per the banner comment preceding
// this block): returns data.calculate_columns_statistics_missing_values() for
// the input columns, after adding every unused instance index to each entry of
// missing_indices.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing; it must be a non-const local, since it is
// modified below.
2829 {
2830  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
2831 
2833 
2834  const Vector<size_t> unused_instances_indices = instances.arrange_unused_indices();
2835 
      // Append each unused instance index to every entry of missing_indices.
2836  for (size_t i = 0; i < unused_instances_indices.size(); i++)
2837  {
2838  for (size_t j = 0; j < missing_indices.size(); j++)
2839  {
2840  missing_indices[j].push_back(unused_instances_indices[i]);
2841  }
2842  }
2843 
2844  return(data.calculate_columns_statistics_missing_values(inputs_indices, missing_indices));
2845 }
2846 
2847 
2848 // Vector< Statistics<double> > calculate_targets_statistics(void) const method
2849 
2858 
// Body of calculate_targets_statistics (name per the banner comment preceding
// this block): returns data.calculate_columns_statistics_missing_values() for
// the target columns, after adding every unused instance index to each entry
// of missing_indices.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing; it must be a non-const local, since it is
// modified below.
2860 {
2861  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
2862 
2864 
2865  const Vector<size_t> unused_instances_indices = instances.arrange_unused_indices();
2866 
      // Append each unused instance index to every entry of missing_indices.
2867  for (size_t i = 0; i < unused_instances_indices.size(); i++)
2868  {
2869  for (size_t j = 0; j < missing_indices.size(); j++)
2870  {
2871  missing_indices[j].push_back(unused_instances_indices[i]);
2872  }
2873  }
2874 
2875  return(data.calculate_columns_statistics_missing_values(targets_indices, missing_indices));
2876 }
2877 
2878 
2879 // Vector<double> calculate_training_target_data_mean(void) const method
2880 
2882 
// Body of calculate_training_target_data_mean (name per the banner comment
// preceding this block): returns data.calculate_mean_missing_values() over the
// training rows and the target columns.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2884 {
2885  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
2886 
2887  const Vector<size_t> training_indices = instances.arrange_training_indices();
2888 
2890 
2891  return(data.calculate_mean_missing_values(training_indices, targets_indices, missing_indices));
2892 }
2893 
2894 
2895 // Vector<double> calculate_selection_target_data_mean(void) const method
2896 
2898 
// Body of calculate_selection_target_data_mean (name per the banner comment
// preceding this block): returns data.calculate_mean_missing_values() over the
// selection rows and the target columns.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2900 {
2901  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
2902 
2903  const Vector<size_t> selection_indices = instances.arrange_selection_indices();
2904 
2906 
2907  return(data.calculate_mean_missing_values(selection_indices, targets_indices, missing_indices));
2908 }
2909 
2910 
2911 // Vector<double> calculate_testing_target_data_mean(void) const method
2912 
2914 
// Body of calculate_testing_target_data_mean (name per the banner comment
// preceding this block): returns data.calculate_mean_missing_values() over the
// testing rows and the target columns.
// NOTE(review): the signature line and the declaration of missing_indices are
// not visible in this listing.
2916 {
2917  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
2918 
2919  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
2920 
2922 
2923  return(data.calculate_mean_missing_values(testing_indices, targets_indices, missing_indices));
2924 }
2925 
2926 
2927 // Matrix<double> calculate_linear_correlations(void) const method
2928 
2932 
2934 {
2935  const size_t inputs_number = variables.count_inputs_number();
2936  const size_t targets_number = variables.count_targets_number();
2937 
2938  const Vector<size_t> input_indices = variables.arrange_inputs_indices();
2939  const Vector<size_t> target_indices = variables.arrange_targets_indices();
2940 
2941  size_t input_index;
2942  size_t target_index;
2943 
2944  const size_t instances_number = instances.get_instances_number();
2945 
2946  Vector<double> input_variable(instances_number);
2947  Vector<double> target_variable(instances_number);
2948 
2949  Matrix<double> linear_correlations(inputs_number, targets_number);
2950 
2951 #ifndef __OPENNN_MPI__
2952 #pragma omp parallel for private(input_index, input_variable, target_index, target_variable)
2953 #endif
2954  for(int i = 0; i < (int)inputs_number; i++)
2955  {
2956  for(int j = 0; j < (int)targets_number; j++)
2957  {
2958  input_index = input_indices[i];
2959 
2960  input_variable = data.arrange_column(input_index);
2961 
2962  target_index = target_indices[j];
2963 
2964  target_variable = data.arrange_column(target_index);
2965 
2966  linear_correlations(i,j) = input_variable.calculate_linear_correlation(target_variable);
2967  }
2968  }
2969 
2970  return(linear_correlations);
2971 }
2972 
2973 
2974 // Matrix<double> calculate_covariance_matrix(void) const method
2975 
2979 
2981 {
2982  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
2983  const Vector<size_t> used_instances_indices = instances.arrange_used_indices();
2984 
2985  const size_t inputs_number = variables.count_inputs_number();
2986 
2987  Matrix<double> covariance_matrix(inputs_number, inputs_number, 0.0);
2988 
2989 #pragma omp parallel for
2990 
2991  for(int i = 0; i < (int)inputs_number; i++)
2992  {
2993  const size_t first_input_index = inputs_indices[i];
2994 
2995  const Vector<double> first_input_data = data.arrange_column(first_input_index, used_instances_indices);
2996 
2997  for(size_t j = 0; j < inputs_number; j++)
2998  {
2999  const size_t second_input_index = inputs_indices[j];
3000 
3001  const Vector<double> second_input_data = data.arrange_column(second_input_index, used_instances_indices);
3002 
3003  covariance_matrix(i,j) = first_input_data.calculate_covariance(second_input_data);
3004  covariance_matrix(j,i) = covariance_matrix(i,j);
3005  }
3006  }
3007 
3008  return (covariance_matrix);
3009 }
3010 
3011 
3012 // Matrix<double> perform_principal_components_analysis(const double& = 0.0) mehtod
3013 
3018 
3019 Matrix<double> DataSet::perform_principal_components_analysis(const double& minimum_explained_variance)
3020 {
3021  //const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
3022 
3023  // Subtract off the mean
3024 
3026 
3027  // Calculate covariance matrix
3028 
3029  const Matrix<double> covariance_matrix = calculate_covariance_matrix();
3030 
3031  // Calculate eigenvectors
3032 
3033  const Matrix<double> eigenvectors = covariance_matrix.calculate_eigenvectors();
3034 
3035  // Calculate eigenvalues
3036 
3037  const Matrix<double> eigenvalues = covariance_matrix.calculate_eigenvalues();
3038 
3039  // Calculate explained variance
3040 
3041  const Vector<double> explained_variance = eigenvalues.arrange_column(0).calculate_explained_variance();
3042 
3043  // Sort principal components
3044 
3045  const Vector<size_t> sorted_principal_components_indices = explained_variance.sort_greater_indices();
3046 
3047  // Choose eigenvectors
3048 
3049  const size_t inputs_number = covariance_matrix.get_columns_number();
3050 
3051  Vector<size_t> principal_components_indices;
3052 
3053  size_t index;
3054 
3055  for(size_t i = 0; i < inputs_number; i++)
3056  {
3057  index = sorted_principal_components_indices[i];
3058 
3059  if(explained_variance[index] >= minimum_explained_variance)
3060  {
3061  principal_components_indices.push_back(i);
3062  }
3063  else
3064  {
3065  continue;
3066  }
3067  }
3068 
3069  const size_t principal_components_number = principal_components_indices.size();
3070 
3071  // Arrange principal components matrix
3072 
3073  Matrix<double> principal_components;
3074 
3075  if(principal_components_number == 0)
3076  {
3077  return principal_components;
3078  }
3079  else
3080  {
3081  principal_components.set(principal_components_number, inputs_number);
3082  }
3083 
3084  for(size_t i = 0; i < principal_components_number; i++)
3085  {
3086  index = sorted_principal_components_indices[i];
3087 
3088  principal_components.set_row(i, eigenvectors.arrange_column(index));
3089  }
3090 
3091  // Return feature matrix
3092 
3093  return principal_components.arrange_submatrix_rows(principal_components_indices);
3094 }
3095 
3096 
3097 // Matrix<double> perform_principal_components_analysis(const Matrix<double>&, const Matrix<double>&, const Vector<double>&, const double& = 0.0) mehtod
3098 
3105 
3107  const Vector<double>& explained_variance,
3108  const double& minimum_explained_variance)
3109 {
3110  // Subtract off the mean
3111 
3113 
3114  // Calculate eigenvectors
3115 
3116  const Matrix<double> eigenvectors = covariance_matrix.calculate_eigenvectors();
3117 
3118  // Sort principal components
3119 
3120  Vector<size_t> sorted_principal_components_indices = explained_variance.sort_greater_indices();
3121 
3122  // Choose eigenvectors
3123 
3124  const size_t inputs_number = covariance_matrix.get_columns_number();
3125 
3126  Vector<size_t> principal_components_indices;
3127 
3128  size_t index;
3129 
3130  for(size_t i = 0; i < inputs_number; i++)
3131  {
3132  index = sorted_principal_components_indices[i];
3133 
3134  if(explained_variance[index] >= minimum_explained_variance)
3135  {
3136  principal_components_indices.push_back(i);
3137  }
3138  else
3139  {
3140  continue;
3141  }
3142  }
3143 
3144  const size_t principal_components_number = principal_components_indices.size();
3145 
3146  // Arrange principal components matrix
3147 
3148  Matrix<double> principal_components;
3149 
3150  if(principal_components_number == 0)
3151  {
3152  return principal_components;
3153  }
3154  else
3155  {
3156  principal_components.set(principal_components_number, inputs_number);
3157  }
3158 
3159  for(size_t i = 0; i < principal_components_number; i++)
3160  {
3161  index = sorted_principal_components_indices[i];
3162 
3163  principal_components.set_row(i, eigenvectors.arrange_column(index));
3164  }
3165 
3166  // Return feature matrix
3167 
3168  return principal_components.arrange_submatrix_rows(principal_components_indices);
3169 }
3170 
3171 
3172 // void transform_principal_components_data(const Matrix<double>&);
3173 
3176 
3178 {
3179  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
3180 
3181  const Matrix<double> target_data = arrange_target_data();
3182 
3184 
3185  const size_t principal_components_number = principal_components.get_rows_number();
3186 
3187  // Transform data
3188 
3189  const Vector<size_t> used_instances = instances.arrange_used_indices();
3190 
3191  const size_t new_instances_number = used_instances.size();
3192 
3193  const Matrix<double> input_data = arrange_input_data();
3194 
3195  Matrix<double> new_data(new_instances_number, principal_components_number, 0.0);
3196 
3197  size_t instance_index;
3198 
3199  for(size_t i = 0; i < new_instances_number; i++)
3200  {
3201  instance_index = used_instances[i];
3202 
3203  for(size_t j = 0; j < principal_components_number; j++)
3204  {
3205  new_data(i,j) = input_data.arrange_row(instance_index).dot(principal_components.arrange_row(j));
3206  }
3207  }
3208 
3209  data = new_data.assemble_columns(target_data);
3210 }
3211 
3212 
3213 // void scale_data_mean_standard_deviation(const Vector< Statistics<double> >&) const method
3214 
3219 
3221 {
3222  // Control sentence (if debug)
3223 
3224  #ifdef __OPENNN_DEBUG__
3225 
3226  std::ostringstream buffer;
3227 
3228  const size_t columns_number = data.get_columns_number();
3229 
3230  const size_t statistics_size = data_statistics.size();
3231 
3232  if(statistics_size != columns_number)
3233  {
3234  buffer << "OpenNN Exception: DataSet class.\n"
3235  << "void scale_data_mean_standard_deviation(const Vector< Statistics<double> >&) method.\n"
3236  << "Size of statistics must be equal to number of columns.\n";
3237 
3238  throw std::logic_error(buffer.str());
3239  }
3240 
3241  #endif
3242 
3243  const size_t variables_number = variables.get_variables_number();
3244 
3245  for(size_t i = 0; i < variables_number; i++)
3246  {
3247  if(display && data_statistics[i].standard_deviation < 1.0e-99)
3248  {
3249  std::cout << "OpenNN Warning: DataSet class.\n"
3250  << "void scale_data_mean_standard_deviation(const Vector< Statistics<Type> >&) method.\n"
3251  << "Standard deviation of variable " << i << " is zero.\n"
3252  << "That variable won't be scaled.\n";
3253  }
3254  }
3255 
3256  data.scale_mean_standard_deviation(data_statistics);
3257 }
3258 
3259 
3260 // Vector< Statistics<double> > scale_data_minimum_maximum(void) method
3261 
3265 
3267 {
3268  const Vector< Statistics<double> > data_statistics = calculate_data_statistics();
3269 
3270  scale_data_minimum_maximum(data_statistics);
3271 
3272  return(data_statistics);
3273 }
3274 
3275 
3276 // Vector< Statistics<double> > scale_data_mean_standard_deviation(void) method
3277 
3281 
3283 {
3284  const Vector< Statistics<double> > data_statistics = calculate_data_statistics();
3285 
3286  scale_data_mean_standard_deviation(data_statistics);
3287 
3288  return(data_statistics);
3289 }
3290 
3291 
3292 // void subtract_input_data_mean(void) method
3293 
3295 
3297 {
3299 
3300  Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
3301  Vector<size_t> used_instances_indices = instances.arrange_used_indices();
3302 
3303  size_t input_index;
3304  size_t instance_index;
3305 
3306  double input_mean;
3307 
3308  for(size_t i = 0; i < inputs_indices.size(); i++)
3309  {
3310  input_index = inputs_indices[i];
3311 
3312  input_mean = input_statistics[i/*input_index*/].mean;
3313 
3314  for(size_t j = 0; j < used_instances_indices.size(); j++)
3315  {
3316  instance_index = used_instances_indices[j];
3317 
3318  data(instance_index,input_index) = data(instance_index,input_index) - input_mean;
3319  }
3320  }
3321 }
3322 
3323 
3324 // void scale_data_minimum_maximum(const Vector< Statistics<double> >&) method
3325 
3330 
3332 {
3333  const size_t variables_number = variables.get_variables_number();
3334 
3335  // Control sentence (if debug)
3336 
3337  #ifdef __OPENNN_DEBUG__
3338 
3339  std::ostringstream buffer;
3340 
3341  const size_t statistics_size = data_statistics.size();
3342 
3343  if(statistics_size != variables_number)
3344  {
3345  buffer << "OpenNN Exception: DataSet class.\n"
3346  << "void scale_data_minimum_maximum(const Vector< Statistics<double> >&) method.\n"
3347  << "Size of data statistics must be equal to number of variables.\n";
3348 
3349  throw std::logic_error(buffer.str());
3350  }
3351 
3352  #endif
3353 
3354  for(size_t i = 0; i < variables_number; i++)
3355  {
3356  if(display && data_statistics[i].maximum-data_statistics[i].minimum < 1.0e-99)
3357  {
3358  std::cout << "OpenNN Warning: DataSet class.\n"
3359  << "void scale_data_minimum_maximum(const Vector< Statistics<Type> >&) method.\n"
3360  << "Range of variable " << i << " is zero.\n"
3361  << "That variable won't be scaled.\n";
3362  }
3363  }
3364 
3365 
3366  data.scale_minimum_maximum(data_statistics);
3367 }
3368 
3369 /*
3370 // void scale_data(const std::string&, const Vector< Statistics<double> >&) method
3371 
3378 
3379 void DataSet::scale_data(const std::string& scaling_unscaling_method_string, const Vector< Statistics<double> >& data_statistics)
3380 {
3381  switch(get_scaling_unscaling_method(scaling_unscaling_method_string))
3382  {
3383  case MinimumMaximum:
3384  {
3385  scale_data_minimum_maximum(data_statistics);
3386  }
3387  break;
3388 
3389  case MeanStandardDeviation:
3390  {
3391  scale_data_mean_standard_deviation(data_statistics);
3392  }
3393  break;
3394 
3395  default:
3396  {
3397  std::ostringstream buffer;
3398 
3399  buffer << "OpenNN Exception: DataSet class\n"
3400  << "void scale_data(const std::string&, const Vector< Vector<double> >&) method.\n"
3401  << "Unknown data scaling and unscaling method.\n";
3402 
3403  throw std::logic_error(buffer.str());
3404  }
3405  break;
3406  }
3407 }
3408 
3409 
3410 // Vector< Statistics<double> > scale_data(void) method
3411 
3414 
3415 Vector< Statistics<double> > DataSet::scale_data(const std::string& scaling_unscaling_method)
3416 {
3417  const Vector< Statistics<double> > statistics = data.calculate_statistics();
3418 
3419  switch(get_scaling_unscaling_method(scaling_unscaling_method))
3420  {
3421  case MinimumMaximum:
3422  {
3423  scale_data_minimum_maximum(statistics);
3424  }
3425  break;
3426 
3427  case MeanStandardDeviation:
3428  {
3429  scale_data_mean_standard_deviation(statistics);
3430  }
3431  break;
3432 
3433  default:
3434  {
3435  std::ostringstream buffer;
3436 
3437  buffer << "OpenNN Exception: DataSet class\n"
3438  << "Vector< Statistics<double> > scale_data(const std::string&) method.\n"
3439  << "Unknown scaling and unscaling method.\n";
3440 
3441  throw std::logic_error(buffer.str());
3442  }
3443  break;
3444  }
3445 
3446  return(statistics);
3447 }
3448 */
3449 
3450 // void scale_inputs_mean_standard_deviation(const Vector< Statistics<double> >&) method
3451 
3456 
3458 {
3459  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
3460 
3461  data.scale_columns_mean_standard_deviation(inputs_statistics, inputs_indices);
3462 }
3463 
3464 
3465 // Vector< Statistics<double> > scale_inputs_mean_standard_deviation(void) method
3466 
3470 
3472 {
3473  // Control sentence (if debug)
3474 
3475  #ifdef __OPENNN_DEBUG__
3476 
3477  if(data.empty())
3478  {
3479  std::ostringstream buffer;
3480 
3481  buffer << "OpenNN Exception: DataSet class.\n"
3482  << "Vector< Statistics<double> > scale_inputs_mean_standard_deviation(void) method.\n"
3483  << "Data file is not loaded.\n";
3484 
3485  throw std::logic_error(buffer.str());
3486  }
3487 
3488  #endif
3489 
3490  const Vector< Statistics<double> > inputs_statistics = calculate_inputs_statistics();
3491 
3492  scale_inputs_mean_standard_deviation(inputs_statistics);
3493 
3494  return(inputs_statistics);
3495 }
3496 
3497 
3498 // void scale_inputs_minimum_maximum(const Vector< Statistics<double> >&) method
3499 
3504 
3506 {
3507  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
3508 
3509  data.scale_columns_minimum_maximum(inputs_statistics, inputs_indices);
3510 }
3511 
3512 
3513 // Vector< Statistics<double> > scale_inputs_minimum_maximum(void) method
3514 
3518 
3520 {
3521  // Control sentence (if debug)
3522 
3523  #ifdef __OPENNN_DEBUG__
3524 
3525  if(data.empty())
3526  {
3527  std::ostringstream buffer;
3528 
3529  buffer << "OpenNN Exception: DataSet class.\n"
3530  << "Vector< Statistics<double> > scale_inputs_minimum_maximum(void) method.\n"
3531  << "Data file is not loaded.\n";
3532 
3533  throw std::logic_error(buffer.str());
3534  }
3535 
3536  #endif
3537 
3538  const Vector< Statistics<double> > inputs_statistics = calculate_inputs_statistics();
3539 
3540  scale_inputs_minimum_maximum(inputs_statistics);
3541 
3542  return(inputs_statistics);
3543 }
3544 
3545 
3546 // Vector< Vector<double> > scale_inputs(const std::string&) method
3547 
3552 
3553 Vector< Statistics<double> > DataSet::scale_inputs(const std::string& scaling_unscaling_method)
3554 {
3555  switch(get_scaling_unscaling_method(scaling_unscaling_method))
3556  {
3557  case NoScaling:
3558  {
3559  return(calculate_inputs_statistics());
3560  }
3561  break;
3562 
3563  case MinimumMaximum:
3564  {
3565  return(scale_inputs_minimum_maximum());
3566  }
3567  break;
3568 
3569  case MeanStandardDeviation:
3570  {
3572  }
3573  break;
3574 
3575  default:
3576  {
3577  std::ostringstream buffer;
3578 
3579  buffer << "OpenNN Exception: DataSet class\n"
3580  << "Vector< Statistics<double> > scale_inputs(void) method.\n"
3581  << "Unknown scaling and unscaling method.\n";
3582 
3583  throw std::logic_error(buffer.str());
3584  }
3585  break;
3586  }
3587 }
3588 
3589 
3590 // void scale_inputs(const std::string&, const Vector< Statistics<double> >&) method
3591 
3595 
3596 void DataSet::scale_inputs(const std::string& scaling_unscaling_method, const Vector< Statistics<double> >& inputs_statistics)
3597 {
3598  switch(get_scaling_unscaling_method(scaling_unscaling_method))
3599  {
3600  case NoScaling:
3601  {
3602  // Do nothing
3603  }
3604  break;
3605 
3606  case MinimumMaximum:
3607  {
3608  scale_inputs_minimum_maximum(inputs_statistics);
3609  }
3610  break;
3611 
3612  case MeanStandardDeviation:
3613  {
3614  scale_inputs_mean_standard_deviation(inputs_statistics);
3615  }
3616  break;
3617 
3618  default:
3619  {
3620  std::ostringstream buffer;
3621 
3622  buffer << "OpenNN Exception: DataSet class\n"
3623  << "void scale_inputs(const std::string&, const Vector< Statistics<double> >&) method.\n"
3624  << "Unknown scaling and unscaling method.\n";
3625 
3626  throw std::logic_error(buffer.str());
3627  }
3628  break;
3629  }
3630 }
3631 
3632 
3633 // void scale_targets_mean_standard_deviation(const Vector< Statistics<double> >&)
3634 
3639 
3641 {
3642  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
3643 
3644  data.scale_columns_mean_standard_deviation(targets_statistics, targets_indices);
3645 }
3646 
3647 
3648 // Vector< Statistics<double> > scale_targets_mean_standard_deviation(void) method
3649 
3653 
3655 {
3656  // Control sentence (if debug)
3657 
3658  #ifdef __OPENNN_DEBUG__
3659 
3660  if(data.empty())
3661  {
3662  std::ostringstream buffer;
3663 
3664  buffer << "OpenNN Exception: DataSet class.\n"
3665  << "Vector< Statistics<double> > scale_targets_mean_standard_deviation(void) method.\n"
3666  << "Data file is not loaded.\n";
3667 
3668  throw std::logic_error(buffer.str());
3669  }
3670 
3671  #endif
3672 
3673  const Vector< Statistics<double> > targets_statistics = calculate_targets_statistics();
3674 
3675  scale_targets_mean_standard_deviation(targets_statistics);
3676 
3677  return(targets_statistics);
3678 }
3679 
3680 
3681 // void scale_targets_minimum_maximum(const Vector< Statistics<double> >&) method
3682 
3687 
3689 {
3690  // Control sentence (if debug)
3691 
3692  #ifdef __OPENNN_DEBUG__
3693 
3694  if(data.empty())
3695  {
3696  std::ostringstream buffer;
3697 
3698  buffer << "OpenNN Exception: DataSet class.\n"
3699  << "Vector< Statistics<double> > scale_targets_minimum_maximum(void) method.\n"
3700  << "Data file is not loaded.\n";
3701 
3702  throw std::logic_error(buffer.str());
3703  }
3704 
3705  #endif
3706 
3707  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
3708 
3709  data.scale_columns_minimum_maximum(targets_statistics, targets_indices);
3710 }
3711 
3712 
3713 // Vector< Statistics<double> > scale_targets_minimum_maximum(void) method
3714 
3718 
3720 {
3721  const Vector< Statistics<double> > targets_statistics = calculate_targets_statistics();
3722 
3723  scale_targets_minimum_maximum(targets_statistics);
3724 
3725  return(targets_statistics);
3726 }
3727 
3728 
3729 // Vector< Statistics<double> > scale_targets(const std::string&) method
3730 
3735 
3736 Vector< Statistics<double> > DataSet::scale_targets(const std::string& scaling_unscaling_method)
3737 {
3738  switch(get_scaling_unscaling_method(scaling_unscaling_method))
3739  {
3740 
3741  case NoUnscaling:
3742  {
3743  return(calculate_targets_statistics());
3744  }
3745  break;
3746  case MinimumMaximum:
3747  {
3749  }
3750  break;
3751 
3752  case MeanStandardDeviation:
3753  {
3755  }
3756  break;
3757 
3758  default:
3759  {
3760  std::ostringstream buffer;
3761 
3762  buffer << "OpenNN Exception: DataSet class\n"
3763  << "Vector< Statistics<double> > scale_targets(const std::string&) method.\n"
3764  << "Unknown scaling and unscaling method.\n";
3765 
3766  throw std::logic_error(buffer.str());
3767  }
3768  break;
3769  }
3770 }
3771 
3772 
3773 // void scale_targets(const std::string&, const Vector< Statistics<double> >&) method
3774 
3777 
3778 void DataSet::scale_targets(const std::string& scaling_unscaling_method, const Vector< Statistics<double> >& targets_statistics)
3779 {
3780  switch(get_scaling_unscaling_method(scaling_unscaling_method))
3781  {
3782  case NoUnscaling:
3783  {
3784  // Do nothing
3785  }
3786  break;
3787  case MinimumMaximum:
3788  {
3789  scale_targets_minimum_maximum(targets_statistics);
3790  }
3791  break;
3792 
3793  case MeanStandardDeviation:
3794  {
3795  scale_targets_mean_standard_deviation(targets_statistics);
3796  }
3797  break;
3798 
3799  default:
3800  {
3801  std::ostringstream buffer;
3802 
3803  buffer << "OpenNN Exception: DataSet class\n"
3804  << "void scale_targets(const std::string&, const Vector< Statistics<double> >&) method.\n"
3805  << "Unknown scaling and unscaling method.\n";
3806 
3807  throw std::logic_error(buffer.str());
3808  }
3809  break;
3810  }
3811 }
3812 
3813 
3814 // void unscale_data_mean_standard_deviation(const Vector< Statistics<double> >&) method
3815 
3820 
3822 {
3823  data.unscale_mean_standard_deviation(data_statistics);
3824 }
3825 
3826 
3827 // void unscale_data_minimum_maximum(const Vector< Statistics<double> >&) method
3828 
3833 
3835 {
3836  data.unscale_minimum_maximum(data_statistics);
3837 }
3838 
3839 
3840 // void unscale_inputs_mean_standard_deviation(const Vector< Statistics<double> >&) method
3841 
3846 
3848 {
3849  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
3850 
3851  data.unscale_columns_mean_standard_deviation(data_statistics, inputs_indices);
3852 }
3853 
3854 
3855 // void unscale_inputs_minimum_maximum(const Vector< Statistics<double> >&) method
3856 
3861 
3863 {
3864  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
3865 
3866  data.unscale_columns_minimum_maximum(data_statistics, inputs_indices);
3867 }
3868 
3869 
3870 // void unscale_targets_mean_standard_deviation(const Vector< Statistics<double> >&) method
3871 
3876 
3878 {
3879  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
3880 
3881  data.unscale_columns_mean_standard_deviation(data_statistics, targets_indices);
3882 }
3883 
3884 
3885 // void unscale_targets_minimum_maximum(const Vector< Statistics<double> >&) method
3886 
3891 
3893 {
3894  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
3895 
3896  data.unscale_columns_minimum_maximum(data_statistics, targets_indices);
3897 }
3898 
3899 
3900 // void initialize_data(const double& value) method
3901 
3904 
3905 void DataSet::initialize_data(const double& new_value)
3906 {
3907  data.initialize(new_value);
3908 }
3909 
3910 
3911 // void randomize_data_uniform(const double&, const double&) method
3912 
3915 
3916 void DataSet::randomize_data_uniform(const double& minimum, const double& maximum)
3917 {
3918  data.randomize_uniform(minimum, maximum);
3919 }
3920 
3921 
3922 // void randomize_data_normal(const double&, const double&) method
3923 
3926 
3927 void DataSet::randomize_data_normal(const double& mean, const double& standard_deviation)
3928 {
3929  data.randomize_normal(mean, standard_deviation);
3930 }
3931 
3932 
3933 // tinyxml2::XMLDocument* to_XML(void) const method
3934 
3936 
3937 tinyxml2::XMLDocument* DataSet::to_XML(void) const
3938 {
3939  tinyxml2::XMLDocument* document = new tinyxml2::XMLDocument;
3940 
3941  std::ostringstream buffer;
3942 
3943  // Data set
3944 
3945  tinyxml2::XMLElement* data_set_element = document->NewElement("DataSet");
3946  document->InsertFirstChild(data_set_element);
3947 
3948  tinyxml2::XMLElement* element = NULL;
3949  tinyxml2::XMLText* text = NULL;
3950 
3951  // Data file
3952 
3953  tinyxml2::XMLElement* data_file_element = document->NewElement("DataFile");
3954 
3955  data_set_element->InsertFirstChild(data_file_element);
3956 
3957  // File type
3958  {
3959  element = document->NewElement("FileType");
3960  data_file_element->LinkEndChild(element);
3961 
3962  text = document->NewText(write_file_type().c_str());
3963  element->LinkEndChild(text);
3964  }
3965 
3966  //First cell
3967  {
3968  element = document->NewElement("FirstCell");
3969  data_file_element->LinkEndChild(element);
3970 
3971  text = document->NewText(write_first_cell().c_str());
3972  element->LinkEndChild(text);
3973  }
3974 
3975  //Last cell
3976  {
3977  element = document->NewElement("LastCell");
3978  data_file_element->LinkEndChild(element);
3979 
3980  text = document->NewText(write_last_cell().c_str());
3981  element->LinkEndChild(text);
3982  }
3983 
3984  //Sheet number
3985  {
3986  element = document->NewElement("SheetNumber");
3987  data_file_element->LinkEndChild(element);
3988 
3989  buffer.str("");
3990  buffer << write_sheet_number();
3991 
3992  text = document->NewText(buffer.str().c_str());
3993  element->LinkEndChild(text);
3994  }
3995 
3996  // Lags Number
3997  {
3998  element = document->NewElement("LagsNumber");
3999  data_file_element->LinkEndChild(element);
4000 
4001  const size_t lags_number = get_lags_number();
4002 
4003  buffer.str("");
4004  buffer << lags_number;
4005 
4006  text = document->NewText(buffer.str().c_str());
4007  element->LinkEndChild(text);
4008  }
4009 
4010  // Steps Ahead
4011  {
4012  element = document->NewElement("StepsAhead");
4013  data_file_element->LinkEndChild(element);
4014 
4015  const size_t steps_ahead = get_steps_ahead();
4016 
4017  buffer.str("");
4018  buffer << steps_ahead;
4019 
4020  text = document->NewText(buffer.str().c_str());
4021  element->LinkEndChild(text);
4022  }
4023 
4024  // Header line
4025  {
4026  element = document->NewElement("ColumnsName");
4027  data_file_element->LinkEndChild(element);
4028 
4029  buffer.str("");
4030  buffer << header_line;
4031 
4032  text = document->NewText(buffer.str().c_str());
4033  element->LinkEndChild(text);
4034  }
4035 
4036  // Rows label
4037  {
4038  element = document->NewElement("RowsLabel");
4039  data_file_element->LinkEndChild(element);
4040 
4041  buffer.str("");
4042  buffer << rows_label;
4043 
4044  text = document->NewText(buffer.str().c_str());
4045  element->LinkEndChild(text);
4046  }
4047 
4048  // Separator
4049  {
4050  element = document->NewElement("Separator");
4051  data_file_element->LinkEndChild(element);
4052 
4053  text = document->NewText(write_separator().c_str());
4054  element->LinkEndChild(text);
4055  }
4056 
4057  // Missing values label
4058  {
4059  element = document->NewElement("MissingValuesLabel");
4060  data_file_element->LinkEndChild(element);
4061 
4062  text = document->NewText(missing_values_label.c_str());
4063  element->LinkEndChild(text);
4064  }
4065 
4066  // Data file name
4067  {
4068  element = document->NewElement("DataFileName");
4069  data_file_element->LinkEndChild(element);
4070 
4071  text = document->NewText(data_file_name.c_str());
4072  element->LinkEndChild(text);
4073  }
4074 
4075  // Variables
4076  {
4077  element = document->NewElement("Variables");
4078  data_set_element->LinkEndChild(element);
4079 
4080  const tinyxml2::XMLDocument* variables_document = variables.to_XML();
4081 
4082  const tinyxml2::XMLElement* variables_element = variables_document->FirstChildElement("Variables");
4083 
4084  DeepClone(element, variables_element, document, NULL);
4085 
4086  delete variables_document;
4087  }
4088 
4089  // Instances
4090  {
4091  element = document->NewElement("Instances");
4092  data_set_element->LinkEndChild(element);
4093 
4094  const tinyxml2::XMLDocument* instances_document = instances.to_XML();
4095 
4096  const tinyxml2::XMLElement* instances_element = instances_document->FirstChildElement("Instances");
4097 
4098  DeepClone(element, instances_element, document, NULL);
4099 
4100  delete instances_document;
4101  }
4102 
4103  // Missing values
4104  {
4105  element = document->NewElement("MissingValues");
4106  data_set_element->LinkEndChild(element);
4107 
4108  const tinyxml2::XMLDocument* missing_values_document = missing_values.to_XML();
4109 
4110  const tinyxml2::XMLElement* missing_values_element = missing_values_document->FirstChildElement("MissingValues");
4111 
4112  DeepClone(element, missing_values_element, document, NULL);
4113 
4114  delete missing_values_document;
4115  }
4116 
4117  // Display
4118 // {
4119 // element = document->NewElement("Display");
4120 // data_set_element->LinkEndChild(element);
4121 
4122 // buffer.str("");
4123 // buffer << display;
4124 
4125 // text = document->NewText(buffer.str().c_str());
4126 // element->LinkEndChild(text);
4127 // }
4128 
4129  return(document);
4130 }
4131 
4132 
4133 // void write_XML(tinyxml2::XMLPrinter&) const method
4134 
4136 
4137 void DataSet::write_XML(tinyxml2::XMLPrinter& file_stream) const
4138 {
4139  std::ostringstream buffer;
4140 
4141  file_stream.OpenElement("DataSet");
4142 
4143  // Data file
4144 
4145  file_stream.OpenElement("DataFile");
4146 
4147  // File type
4148 
4149  {
4150  file_stream.OpenElement("FileType");
4151 
4152  file_stream.PushText(write_file_type().c_str());
4153 
4154  file_stream.CloseElement();
4155  }
4156 
4157  // First cell
4158 
4159  {
4160  file_stream.OpenElement("FirstCell");
4161 
4162  file_stream.PushText(write_first_cell().c_str());
4163 
4164  file_stream.CloseElement();
4165  }
4166 
4167  // Last cell
4168 
4169  {
4170  file_stream.OpenElement("LastCell");
4171 
4172  file_stream.PushText(write_last_cell().c_str());
4173 
4174  file_stream.CloseElement();
4175  }
4176 
4177  // Sheet number
4178 
4179  {
4180  file_stream.OpenElement("SheetNumber");
4181 
4182  buffer.str("");
4183  buffer << write_sheet_number();
4184 
4185  file_stream.PushText(buffer.str().c_str());
4186 
4187  file_stream.CloseElement();
4188  }
4189 
4190  // Lags number
4191 
4192  {
4193  file_stream.OpenElement("LagsNumber");
4194 
4195  buffer.str("");
4196  buffer << get_lags_number();
4197 
4198  file_stream.PushText(buffer.str().c_str());
4199 
4200  file_stream.CloseElement();
4201  }
4202 
4203  // Steps Ahead
4204 
4205  {
4206  file_stream.OpenElement("StepsAhead");
4207 
4208  buffer.str("");
4209  buffer << get_steps_ahead();
4210 
4211  file_stream.PushText(buffer.str().c_str());
4212 
4213  file_stream.CloseElement();
4214  }
4215 
4216  // Header line
4217 
4218  {
4219  file_stream.OpenElement("ColumnsName");
4220 
4221  buffer.str("");
4222  buffer << header_line;
4223 
4224  file_stream.PushText(buffer.str().c_str());
4225 
4226  file_stream.CloseElement();
4227  }
4228 
4229  // Rows label
4230 
4231  {
4232  file_stream.OpenElement("RowsLabel");
4233 
4234  buffer.str("");
4235  buffer << rows_label;
4236 
4237  file_stream.PushText(buffer.str().c_str());
4238 
4239  file_stream.CloseElement();
4240  }
4241 
4242  // Separator
4243 
4244  {
4245  file_stream.OpenElement("Separator");
4246 
4247  file_stream.PushText(write_separator().c_str());
4248 
4249  file_stream.CloseElement();
4250  }
4251 
4252  // Missing values label
4253 
4254  {
4255  file_stream.OpenElement("MissingValuesLabel");
4256 
4257  file_stream.PushText(missing_values_label.c_str());
4258 
4259  file_stream.CloseElement();
4260  }
4261 
4262  // Data file name
4263 
4264  {
4265  file_stream.OpenElement("DataFileName");
4266 
4267  file_stream.PushText(data_file_name.c_str());
4268 
4269  file_stream.CloseElement();
4270  }
4271 
4272  file_stream.CloseElement();
4273 
4274  // Variables
4275 
4276  variables.write_XML(file_stream);
4277 
4278  // Instances
4279 
4280  instances.write_XML(file_stream);
4281 
4282  // Missing values
4283 
4284  missing_values.write_XML(file_stream);
4285 
4286 
4287  file_stream.CloseElement();
4288 }
4289 
4290 
4291 // void from_XML(const tinyxml2::XMLDocument&) method
4292 
4295 
4296 void DataSet::from_XML(const tinyxml2::XMLDocument& data_set_document)
4297 {
4298  std::ostringstream buffer;
4299 
4300  // Data set element
4301 
4302  const tinyxml2::XMLElement* data_set_element = data_set_document.FirstChildElement("DataSet");
4303 
4304  if(!data_set_element)
4305  {
4306  buffer << "OpenNN Exception: DataSet class.\n"
4307  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
4308  << "Data set element is NULL.\n";
4309 
4310  throw std::logic_error(buffer.str());
4311  }
4312 
4313  // Data file
4314 
4315  const tinyxml2::XMLElement* data_file_element = data_set_element->FirstChildElement("DataFile");
4316 
4317  if(!data_file_element)
4318  {
4319  buffer << "OpenNN Exception: DataSet class.\n"
4320  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
4321  << "Data file element is NULL.\n";
4322 
4323  throw std::logic_error(buffer.str());
4324  }
4325 
4326  // Data file name
4327  {
4328  const tinyxml2::XMLElement* data_file_name_element = data_file_element->FirstChildElement("DataFileName");
4329 
4330  if(!data_file_name_element)
4331  {
4332  buffer << "OpenNN Exception: DataSet class.\n"
4333  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
4334  << "Data file name element is NULL.\n";
4335 
4336  throw std::logic_error(buffer.str());
4337  }
4338 
4339  if(data_file_name_element->GetText())
4340  {
4341  const std::string new_data_file_name = data_file_name_element->GetText();
4342 
4343  set_data_file_name(new_data_file_name);
4344  }
4345  }
4346 
4347  // Lags number
4348  {
4349  const tinyxml2::XMLElement* lags_number_element = data_file_element->FirstChildElement("LagsNumber");
4350 
4351  if(!lags_number_element)
4352  {
4353  buffer << "OpenNN Exception: DataSet class.\n"
4354  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
4355  << "Lags number element is NULL.\n";
4356 
4357  throw std::logic_error(buffer.str());
4358  }
4359 
4360  if(lags_number_element->GetText())
4361  {
4362  const size_t new_lags_number = atoi(lags_number_element->GetText());
4363 
4364  set_lags_number(new_lags_number);
4365  }
4366  }
4367 
4368  // Steps ahead
4369  {
4370  const tinyxml2::XMLElement* steps_ahead_element = data_file_element->FirstChildElement("StepsAhead");
4371 
4372  if(!steps_ahead_element)
4373  {
4374  buffer << "OpenNN Exception: DataSet class.\n"
4375  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
4376  << "Steps ahead element is NULL.\n";
4377 
4378  throw std::logic_error(buffer.str());
4379  }
4380 
4381  if(steps_ahead_element->GetText())
4382  {
4383  const size_t new_steps_ahead = atoi(steps_ahead_element->GetText());
4384 
4385  set_steps_ahead_number(new_steps_ahead);
4386  }
4387  }
4388 
4389  // File Type
4390  {
4391  const tinyxml2::XMLElement* file_type_element = data_file_element->FirstChildElement("FileType");
4392 
4393  if(!file_type_element)
4394  {
4395  buffer << "OpenNN Exception: DataSet class.\n"
4396  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
4397  << "File type element is NULL.\n";
4398 
4399  throw std::logic_error(buffer.str());
4400  }
4401 
4402  if(file_type_element->GetText())
4403  {
4404  const std::string new_file_type = file_type_element->GetText();
4405 
4406  set_file_type(new_file_type);
4407  }
4408  }
4409 
4410  // First Cell
4411  {
4412  const tinyxml2::XMLElement* first_cell_element = data_file_element->FirstChildElement("FirstCell");
4413 
4414  if(!first_cell_element)
4415  {
4416  buffer << "OpenNN Exception: DataSet class.\n"
4417  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
4418  << "First Cell element is NULL.\n";
4419 
4420  throw std::logic_error(buffer.str());
4421  }
4422 
4423  if(first_cell_element->GetText())
4424  {
4425  const std::string new_first_cell = first_cell_element->GetText();
4426 
4427  first_cell = new_first_cell;
4428  }
4429  }
4430 
4431  // Last Cell
4432  {
4433  const tinyxml2::XMLElement* last_cell_element = data_file_element->FirstChildElement("LastCell");
4434 
4435  if(!last_cell_element)
4436  {
4437  buffer << "OpenNN Exception: DataSet class.\n"
4438  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
4439  << "Last Cell element is NULL.\n";
4440 
4441  throw std::logic_error(buffer.str());
4442  }
4443 
4444  if(last_cell_element->GetText())
4445  {
4446  const std::string new_last_cell = last_cell_element->GetText();
4447 
4448  last_cell = new_last_cell;
4449  }
4450  }
4451 
4452  // Sheet number
4453  {
4454  const tinyxml2::XMLElement* sheet_number_element = data_file_element->FirstChildElement("SheetNumber");
4455 
4456  if(!sheet_number_element)
4457  {
4458  buffer << "OpenNN Exception: DataSet class.\n"
4459  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
4460  << "Sheet Number element is NULL.\n";
4461 
4462  throw std::logic_error(buffer.str());
4463  }
4464 
4465  if(sheet_number_element->GetText())
4466  {
4467  const size_t new_sheet_number = atoi(sheet_number_element->GetText());
4468 
4469  sheet_number = new_sheet_number;
4470  }
4471  }
4472 
4473  // Header line
4474  {
4475  const tinyxml2::XMLElement* header_element = data_file_element->FirstChildElement("ColumnsName");
4476 
4477  if(header_element)
4478  {
4479  const std::string new_header_string = header_element->GetText();
4480 
4481  try
4482  {
4483  set_header_line(new_header_string != "0");
4484  }
4485  catch(const std::logic_error& e)
4486  {
4487  std::cout << e.what() << std::endl;
4488  }
4489  }
4490  }
4491 
4492  // Rows label
4493  {
4494  const tinyxml2::XMLElement* rows_label_element = data_file_element->FirstChildElement("RowsLabel");
4495 
4496  if(rows_label_element)
4497  {
4498  const std::string new_rows_label_string = rows_label_element->GetText();
4499 
4500  try
4501  {
4502  set_rows_label(new_rows_label_string != "0");
4503  }
4504  catch(const std::logic_error& e)
4505  {
4506  std::cout << e.what() << std::endl;
4507  }
4508  }
4509  }
4510 
4511  // Separator
4512  {
4513  const tinyxml2::XMLElement* separator_element = data_file_element->FirstChildElement("Separator");
4514 
4515  if(separator_element)
4516  {
4517  if(separator_element->GetText())
4518  {
4519  const std::string new_separator = separator_element->GetText();
4520 
4521  set_separator(new_separator);
4522  }
4523  else
4524  {
4525  set_separator("Space");
4526  }
4527  }
4528  else
4529  {
4530  set_separator("Space");
4531  }
4532  }
4533 
4534  // Missing values label
4535  {
4536  const tinyxml2::XMLElement* missing_values_label_element = data_file_element->FirstChildElement("MissingValuesLabel");
4537 
4538  if(missing_values_label_element)
4539  {
4540  if(missing_values_label_element->GetText())
4541  {
4542  const std::string new_missing_values_label = missing_values_label_element->GetText();
4543 
4544  set_missing_values_label(new_missing_values_label);
4545  }
4546  }
4547  }
4548 
4549  // Variables
4550  {
4551  const tinyxml2::XMLElement* variables_element = data_set_element->FirstChildElement("Variables");
4552 
4553  if(variables_element)
4554  {
4555  tinyxml2::XMLDocument variables_document;
4556 
4557  tinyxml2::XMLElement* variables_element_clone = variables_document.NewElement("Variables");
4558  variables_document.InsertFirstChild(variables_element_clone);
4559 
4560  DeepClone(variables_element_clone, variables_element, &variables_document, NULL);
4561 
4562  variables.from_XML(variables_document);
4563  }
4564  }
4565 
4566  // Instances
4567  {
4568  const tinyxml2::XMLElement* instances_element = data_set_element->FirstChildElement("Instances");
4569 
4570  if(instances_element)
4571  {
4572  tinyxml2::XMLDocument instances_document;
4573 
4574  tinyxml2::XMLElement* instances_element_clone = instances_document.NewElement("Instances");
4575  instances_document.InsertFirstChild(instances_element_clone);
4576 
4577  DeepClone(instances_element_clone, instances_element, &instances_document, NULL);
4578 
4579  instances.from_XML(instances_document);
4580  }
4581  }
4582 
4583  // Missing values
4584  {
4585  const tinyxml2::XMLElement* missing_values_element = data_set_element->FirstChildElement("MissingValues");
4586 
4587  if(missing_values_element)
4588  {
4589  tinyxml2::XMLDocument missing_values_document;
4590 
4591  tinyxml2::XMLElement* missing_values_element_clone = missing_values_document.NewElement("MissingValues");
4592  missing_values_document.InsertFirstChild(missing_values_element_clone);
4593 
4594  DeepClone(missing_values_element_clone, missing_values_element, &missing_values_document, NULL);
4595 
4596  missing_values.from_XML(missing_values_document);
4597  }
4598  }
4599 
4600  // Display
4601  {
4602  const tinyxml2::XMLElement* display_element = data_set_element->FirstChildElement("Display");
4603 
4604  if(display_element)
4605  {
4606  const std::string new_display_string = display_element->GetText();
4607 
4608  try
4609  {
4610  set_display(new_display_string != "0");
4611  }
4612  catch(const std::logic_error& e)
4613  {
4614  std::cout << e.what() << std::endl;
4615  }
4616  }
4617  }
4618 
4619 }
4620 
4621 
4622 // std::string to_string(void) const method
4623 
4625 
// Builds a multi-line, human-readable description of this data set: the data file name,
// header line flag, separator, missing values label, the data matrix, the display flag
// and the string forms of the variables and instances members.
// NOTE(review): in this listing the streaming statement below has no terminating
// semicolon before the return and one listing line is absent after the instances line;
// the final continuation line appears to have been lost. Confirm against the upstream source.
4626 std::string DataSet::to_string(void) const
4627 {
4628  std::ostringstream buffer;
4629
4630  buffer << "Data set object\n"
4631  << "Data file name: " << data_file_name << "\n"
4632  << "Header line: " << header_line << "\n"
4633  << "Separator: " << separator << "\n"
4634  << "Missing values label: " << missing_values_label << "\n"
4635  << "Data:\n" << data << "\n"
4636  << "Display: " << display << "\n"
4637  << variables.to_string()
4638  << instances.to_string()
4640
4641  return(buffer.str());
4642 }
4643 
4644 
4645 // void print(void) const method
4646 
4648 
4649 void DataSet::print(void) const
4650 {
4651  if(display)
4652  {
4653  std::cout << to_string();
4654  }
4655 }
4656 
4657 
4658 // void print_summary(void) const method
4659 
4661 
4662 void DataSet::print_summary(void) const
4663 {
4664  if(display)
4665  {
4666  const size_t variables_number = variables.get_variables_number();
4667  const size_t instances_number = instances.get_instances_number();
4668  const size_t missing_values_number = missing_values.get_missing_values_number();
4669 
4670  std::cout << "Data set object summary:\n"
4671  << "Number of variables: " << variables_number << "\n"
4672  << "Number of instances: " << instances_number << "\n"
4673  << "Number of missing values: " << missing_values_number << std::endl;
4674  }
4675 }
4676 
4677 
4678 // void save(const std::string&) const method
4679 
4682 
4683 void DataSet::save(const std::string& file_name) const
4684 {
4685  tinyxml2::XMLDocument* document = to_XML();
4686 
4687  document->SaveFile(file_name.c_str());
4688 
4689  delete document;
4690 }
4691 
4692 
4693 // void load(const std::string&) method
4694 
4717 
4718 void DataSet::load(const std::string& file_name)
4719 {
4720  tinyxml2::XMLDocument document;
4721 
4722  if(document.LoadFile(file_name.c_str()))
4723  {
4724  std::ostringstream buffer;
4725 
4726  buffer << "OpenNN Exception: DataSet class.\n"
4727  << "void load(const std::string&) method.\n"
4728  << "Cannot load XML file " << file_name << ".\n";
4729 
4730  throw std::logic_error(buffer.str());
4731  }
4732 
4733  from_XML(document);
4734 }
4735 
4736 
4737 // void print_data(void) const method
4738 
4740 
4741 void DataSet::print_data(void) const
4742 {
4743  if(display)
4744  {
4745  std::cout << data << std::endl;
4746  }
4747 }
4748 
4749 
4750 // void print_data_preview(void) const method
4751 
4754 
4756 {
4757  if(display)
4758  {
4759  const size_t instances_number = instances.get_instances_number();
4760 
4761  if(instances_number > 0)
4762  {
4763  const Vector<double> first_instance = data.arrange_row(0);
4764 
4765  std::cout << "First instance:\n"
4766  << first_instance << std::endl;
4767  }
4768 
4769  if(instances_number > 1)
4770  {
4771  const Vector<double> second_instance = data.arrange_row(1);
4772 
4773  std::cout << "Second instance:\n"
4774  << second_instance << std::endl;
4775  }
4776 
4777  if(instances_number > 2)
4778  {
4779  const Vector<double> last_instance = data.arrange_row(instances_number-1);
4780 
4781  std::cout << "Instance " << instances_number << ":\n"
4782  << last_instance << std::endl;
4783  }
4784  }
4785 }
4786 
4787 
4788 // void save_data(void) const method
4789 
4791 
4792 void DataSet::save_data(void) const
4793 {
4794  std::ofstream file(data_file_name.c_str());
4795 
4796  if(!file.is_open())
4797  {
4798  std::ostringstream buffer;
4799 
4800  buffer << "OpenNN Exception: DataSet class.\n"
4801  << "void save_data(void) const method.\n"
4802  << "Cannot open data file.\n";
4803 
4804  throw std::logic_error(buffer.str());
4805  }
4806 
4807  const std::string separator_string = get_separator_string();
4808 
4809  if(header_line)
4810  {
4811  const Vector<std::string> variables_name = variables.arrange_names();
4812 
4813  file << variables_name.to_string(separator_string) << std::endl;
4814  }
4815 
4816  // Write data
4817 
4818  const size_t rows_number = data.get_rows_number();
4819  const size_t columns_number = data.get_columns_number();
4820 
4821  for(size_t i = 0; i < rows_number; i++)
4822  {
4823  for(size_t j = 0; j < columns_number; j++)
4824  {
4825  file << data(i,j);
4826 
4827  if(j != columns_number-1)
4828  {
4829  file << separator_string;
4830  }
4831  }
4832 
4833  file << std::endl;
4834  }
4835 
4836  // Close file
4837 
4838  file.close();
4839 }
4840 
4841 
4842 // size_t get_column_index(const Vector< Vector<std::string> >&, const size_t) const method
4843 
4847 
4848 size_t DataSet::get_column_index(const Vector< Vector<std::string> >& nominal_labels, const size_t column_index) const
4849 {
4850  size_t variable_index = 0;
4851 
4852  for(size_t i = 0; i < column_index; i++)
4853  {
4854  if(nominal_labels[i].size() <= 2)
4855  {
4856  variable_index++;
4857  }
4858  else
4859  {
4860  variable_index += nominal_labels[i].size();
4861  }
4862  }
4863 
4864  return variable_index;
4865 }
4866 
4867 
4868 // void check_separator(const std::string&) method
4869 
4873 
4874 void DataSet::check_separator(const std::string& line) const
4875 {
4876  if(line.empty())
4877  {
4878  return;
4879  }
4880 
4881  const std::string separator_string = get_separator_string();
4882 
4883  if(line.find(separator_string) == std::string::npos)
4884  {
4885  std::ostringstream buffer;
4886 
4887  buffer << "OpenNN Exception: DataSet class.\n"
4888  << "void check_separator(const std::string&) method.\n"
4889  << "Separator '" << write_separator() << "' not found in data file " << data_file_name << ".\n";
4890 
4891  throw std::logic_error(buffer.str());
4892  }
4893 }
4894 
4895 
4896 // size_t count_data_file_columns_number(void) const method
4897 
4900 
4902 {
4903  std::ifstream file(data_file_name.c_str());
4904 
4905  std::string line;
4906 
4907  size_t columns_number = 0;
4908 
4909  while(file.good())
4910  {
4911  getline(file, line);
4912 
4913  if(separator != Tab)
4914  {
4915  std::replace(line.begin(), line.end(), '\t', ' ');
4916  }
4917 
4918  trim(line);
4919 
4920  if(line.empty())
4921  {
4922  continue;
4923  }
4924 
4925  check_separator(line);
4926 
4927  columns_number = count_tokens(line);
4928 
4929  break;
4930  }
4931 
4932  file.close();
4933 
4934  return columns_number;
4935 
4936 }
4937 
4938 
4939 // void check_header_line(void) method
4940 
4945 
4947 {
4948  std::ifstream file(data_file_name.c_str());
4949 
4950  std::string line;
4951  Vector<std::string> tokens;
4952 
4953  while(file.good())
4954  {
4955  getline(file, line);
4956 
4957  if(separator != Tab)
4958  {
4959  std::replace(line.begin(), line.end(), '\t', ' ');
4960  }
4961 
4962  trim(line);
4963 
4964  if(line.empty())
4965  {
4966  continue;
4967  }
4968 
4969  break;
4970  }
4971 
4972  file.close();
4973 
4974  check_separator(line);
4975 
4976  tokens = get_tokens(line);
4977 
4978  if(header_line && is_not_numeric(tokens))
4979  {
4980  return;
4981  }
4982  if(header_line && is_numeric(tokens))
4983  {
4984  if(display)
4985  {
4986  std::cout << "OpenNN Warning: DataSet class.\n"
4987  << "void check_header_line(void) method.\n"
4988  << "First line of data file interpreted as not header.\n";
4989  }
4990 
4991  header_line = false;
4992  }
4993  else if(header_line && is_mixed(tokens))
4994  {
4995  std::ostringstream buffer;
4996 
4997  buffer << "OpenNN Exception: DataSet class.\n"
4998  << "void check_header_line(void) method.\n"
4999  << "Header line contains numeric values: \n"
5000  << line << "\n";
5001 
5002  throw std::logic_error(buffer.str());
5003  }
5004  else if(!header_line && is_not_numeric(tokens))
5005  {
5006  if(display)
5007  {
5008  std::cout << "OpenNN Warning: DataSet class.\n"
5009  << "void check_header_line(void) method.\n"
5010  << "First line of data file interpreted as header.\n";
5011  }
5012 
5013  header_line = true;
5014  }
5015 }
5016 
5017 
5018 // Vector<std::string> read_header_line(void) const method
5019 
5021 
5023 {
// Returns the tokens of the first non-empty line of the data file.
// NOTE(review): one listing line is absent right after the opening brace. Below,
// `header_line` is assigned a token vector and returned, so it presumably names a local
// declared on that missing line rather than the header flag; confirm against upstream.
5025
5026  std::string line;
5027
5028  std::ifstream file(data_file_name.c_str());
5029
5030  // First line
5031
5032  while(file.good())
5033  {
5034  getline(file, line);
5035
// Tabs are turned into blanks unless the tab is the configured separator.
5036  if(separator != Tab)
5037  {
5038  std::replace(line.begin(), line.end(), '\t', ' ');
5039  }
5040
5041  trim(line);
5042
// Skip empty lines until some content is found.
5043  if(line.empty())
5044  {
5045  continue;
5046  }
5047
// check_separator throws std::logic_error when the separator is not in the line.
5048  check_separator(line);
5049
5050  header_line = get_tokens(line);
5051
// Only the first non-empty line is of interest.
5052  break;
5053  }
5054
5055  file.close();
5056
5057  return header_line;
5058 }
5059 
5060 
5061 // void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method
5062 
5067 
5068 void DataSet::read_instance(const std::string& line, const Vector< Vector<std::string> >& nominal_labels, const size_t& instance_index)
5069 {
5070  // Control sentence (if debug)
5071 
5072  #ifdef __OPENNN_DEBUG__
5073 
5074  const size_t instances_number = instances.get_instances_number();
5075 
5076  if(instance_index >= instances_number)
5077  {
5078  std::ostringstream buffer;
5079 
5080  buffer << "OpenNN Exception: DataSet class.\n"
5081  << "void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method.\n"
5082  << "Index of instance (" << instance_index << ") must be less than number of instances (" << instances_number << ").\n";
5083 
5084  throw std::logic_error(buffer.str());
5085  }
5086 
5087  #endif
5088 
5089  const Vector<std::string> tokens = get_tokens(line);
5090 
5091  #ifdef __OPENNN_DEBUG__
5092 
5093  if(tokens.size() != nominal_labels.size())
5094  {
5095  std::ostringstream buffer;
5096 
5097  buffer << "OpenNN Exception: DataSet class.\n"
5098  << "void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method.\n"
5099  << "Size of tokens (" << tokens.size() << ") must be equal to size of names (" << nominal_labels.size() << ").\n";
5100 
5101  throw std::logic_error(buffer.str());
5102  }
5103 
5104  #endif
5105 
5106  size_t column_index;
5107 
5108  for(size_t j = 0; j < tokens.size(); j++)
5109  {
5110  column_index = get_column_index(nominal_labels, j);
5111 
5112  if(nominal_labels[j].size() == 0) // Numeric variable
5113  {
5114  if(tokens[j] != missing_values_label) // No missing values
5115  {
5116  data(instance_index, column_index) = atof(tokens[j].c_str());
5117  }
5118  else // Missing values
5119  {
5120  data(instance_index, column_index) = -99.9;
5121 
5122  missing_values.append(instance_index, column_index);
5123  }
5124  }
5125 
5126  else if(nominal_labels[j].size() == 2) // Binary variable
5127  {
5128  if(tokens[j] != missing_values_label) // No missing values
5129  {
5130  if(tokens[j] == "false" || tokens[j] == "False"|| tokens[j] == "FALSE" || tokens[j] == "F"
5131  || tokens[j] == "negative"|| tokens[j] == "Negative"|| tokens[j] == "NEGATIVE")
5132  {
5133  data(instance_index, column_index) = 0.0;
5134  }
5135  else if(tokens[j] == "true" || tokens[j] == "True"|| tokens[j] == "TRUE" || tokens[j] == "T"
5136  || tokens[j] == "positive"|| tokens[j] == "Positive"|| tokens[j] == "POSITIVE")
5137  {
5138  data(instance_index, column_index) = 1.0;
5139  }
5140  else if(tokens[j] == nominal_labels[j][0])
5141  {
5142  data(instance_index, column_index) = 0.0;
5143  }
5144  else if(tokens[j] == nominal_labels[j][1])
5145  {
5146  data(instance_index, column_index) = 1.0;
5147  }
5148  else
5149  {
5150  std::ostringstream buffer;
5151 
5152  buffer << "OpenNN Exception: DataSet class.\n"
5153  << "void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method.\n"
5154  << "Unknown token binary value.\n";
5155 
5156  throw std::logic_error(buffer.str());
5157  }
5158  }
5159  else // Missing values
5160  {
5161  data(instance_index, column_index) = -99.9;
5162 
5163  missing_values.append(instance_index, column_index);
5164  }
5165  }
5166 
5167  else // Nominal variable
5168  {
5169  if(tokens[j] != missing_values_label)
5170  {
5171  for(size_t k = 0; k < nominal_labels[j].size(); k++)
5172  {
5173  if(tokens[j] == nominal_labels[j][k])
5174  {
5175  data(instance_index, column_index+k) = 1.0;
5176  }
5177  else
5178  {
5179  data(instance_index, column_index+k) = 0.0;
5180  }
5181  }
5182  }
5183  else // Missing values
5184  {
5185  for(size_t k = 0; k < nominal_labels[j].size(); k++)
5186  {
5187  data(instance_index, column_index+k) = -99.9;
5188 
5189  missing_values.append(instance_index, column_index+k);
5190  }
5191  }
5192  }
5193  }
5194 }
5195 
5196 
5197 // Vector< Vector<std::string> > set_from_data_file(void) method
5198 
5201 
5203 {
// Scans the data file to size the data set: counts the non-empty data lines, derives the
// number of variables from the per-column labels, resizes the data matrix, variables
// and instances accordingly, and returns the per-column labels.
// NOTE(review): a few listing lines are absent from this body (after the
// "bool numeric" comment and just before the final return); confirm against upstream.
5204  const size_t columns_number = count_data_file_columns_number();
5205
// One (initially empty) label list per data file column. The code that would fill these
// lists is commented out further down, so nothing visible here adds labels.
5206  Vector< Vector<std::string> > nominal_labels(columns_number);
5207
5208  std::string line;
5209  Vector<std::string> tokens;
5210
5211 // bool numeric;
5212
5214
// -1 marks "header not skipped yet"; the first non-empty line then resets it to 0.
5215  int instances_count;
5216
5217  if(header_line)
5218  {
5219  instances_count = -1;
5220  }
5221  else
5222  {
5223  instances_count = 0;
5224  }
5225
5226  std::ifstream file(data_file_name.c_str());
5227
5228  // Rest of lines
5229
5230  while(file.good())
5231  {
5232  getline(file, line);
5233
5234  if(separator != Tab)
5235  {
5236  std::replace(line.begin(), line.end(), '\t', ' ');
5237  }
5238
5239  trim(line);
5240
5241  if(line.empty())
5242  {
5243  continue;
5244  }
5245
// Skip the header: the first non-empty line is not counted as an instance.
5246  if(header_line && instances_count == -1)
5247  {
5248  instances_count = 0;
5249
5250  continue;
5251  }
5252
5253  check_separator(line);
5254
// The tokens are computed but, with the checks below commented out, only the line count is used.
5255  tokens = get_tokens(line);
5256 // caixa
5257  /* if(tokens.size() != columns_number)
5258  {
5259  std::ostringstream buffer;
5260
5261  buffer << "OpenNN Exception: DataSet class.\n"
5262  << "Vector< Vector<std::string> > DataSet::set_from_data_file(void).\n"
5263  << "Row " << instances_count << ": Size of tokens (" << tokens.size() << ") is not equal to "
5264  << "number of columns (" << columns_number << ").\n";
5265
5266  throw std::logic_error(buffer.str());
5267  }
5268 */
5269  instances_count++;
5270
5271  /*
5272  if(tokens.size() == columns_number)
5273  {
5274  instances_count++;
5275  for(size_t j = 0; j < columns_number; j++)
5276  {
5277  numeric = is_numeric(tokens[j]);
5278
5279  if(!numeric
5280  && tokens[j] != missing_values_label
5281  && !nominal_labels[j].contains(tokens[j]))
5282  {
5283  nominal_labels[j].push_back(tokens[j]);
5284  }
5285  }
5286  }*/
5287
5288
5289  }
5290
5291  file.close();
5292
// A column with zero or two labels maps to one variable; any other column maps to one
// variable per label.
5293  size_t variables_count = 0;
5294
5295  for(size_t i = 0; i < columns_number; i++)
5296  {
5297  if(nominal_labels[i].size() == 0 || nominal_labels[i].size() == 2)
5298  {
5299  variables_count++;
5300  }
5301  else
5302  {
5303  variables_count += nominal_labels[i].size();
5304  }
5305  }
5306
5307  // Fix label case
5308
// NOTE(review): this check runs before the empty-data early return below. When no data
// line was counted (instances_count == 0) and a column has no labels, the sizes compare
// equal and this exception is thrown instead of reaching that return; confirm intent.
5309  for(size_t i = 0; i < columns_number; i++)
5310  {
5311  if(nominal_labels[i].size() == (size_t)instances_count)
5312  {
5313  std::ostringstream buffer;
5314
5315  buffer << "OpenNN Exception: DataSet class.\n"
5316  << "Vector< Vector<std::string> > DataSet::set_from_data_file(void).\n"
5317  << "Column " << i << ": All elements are nominal and different. It contains meaningless data.\n";
5318
5319  throw std::logic_error(buffer.str());
5320  }
5321  }
5322
5323  // Set instances and variables number
5324
// Nothing to load: reset the data set and hand back the (empty) labels.
5325  if(instances_count == 0 || variables_count == 0)
5326  {
5327  set();
5328
5329  return(nominal_labels);
5330  }
5331
5332  data.set(instances_count, variables_count);
5333
// The variables object is only rebuilt when its size no longer matches.
5334  if(variables.get_variables_number() != variables_count)
5335  {
5336  variables.set(variables_count);
5337
// When the last file column has more than two labels, its variables are marked as targets.
// NOTE(review): i is unsigned and the loop counts down with i >= bound; if the bound is 0
// the condition can never become false. Confirm the last column is never the only one.
5338  if(nominal_labels[columns_number-1].size() > 2)
5339  {
5340  for(size_t i = variables_count-1; i >= variables_count - nominal_labels[columns_number-1].size(); i--)
5341  {
5342  variables.set_use(i, Variables::Target);
5343  }
5344  }
5345  }
5346
5347  if(instances.get_instances_number() != (size_t)instances_count)
5348  {
5349  instances.set(instances_count);
5350  }
5351
5353
5354  return(nominal_labels);
5355 }
5356 
5357 
5358 // void read_from_data_file(Vector< Vector<std::string> >&) method
5359 
5361 
5363 {
5364  std::ifstream file(data_file_name.c_str());
5365 
5366  file.clear();
5367  file.seekg(0, std::ios::beg);
5368 
5369  std::string line;
5370 
5371  if(header_line)
5372  {
5373  while(file.good())
5374  {
5375  getline(file, line);
5376 
5377  if(separator != Tab)
5378  {
5379  std::replace(line.begin(), line.end(), '\t', ' ');
5380  }
5381 
5382  trim(line);
5383 
5384  if(line.empty())
5385  {
5386  continue;
5387  }
5388 
5389  break;
5390  }
5391  }
5392 
5393  size_t i = 0;
5394 
5395 // #pragma omp parallel for private(i, line)
5396 
5397  while(file.good())
5398  {
5399  getline(file, line);
5400 
5401  if(separator != Tab)
5402  {
5403  std::replace(line.begin(), line.end(), '\t', ' ');
5404  }
5405 
5406  const Vector<std::string> tokens = get_tokens(line);
5407 
5408  trim(line);
5409 
5410  if(line.empty())
5411  {
5412  continue;
5413  }
5414 
5415  // #pragma omp task
5416 /*
5417  if(tokens.size() == nominal_labels.size())
5418  {
5419  read_instance(line, nominal_labels, i);
5420  i++;
5421  }*/
5422 // caixa
5423  read_instance(line, nominal_labels, i);
5424  i++;
5425 
5426  }
5427 
5428  file.close();
5429 }
5430 
5431 
5432 // Vector<std::string> arrange_time_series_prediction_names(const Vector<std::string>&) const method
5433 
5436 
5438 {
5439  Vector<std::string> time_series_prediction_names;
5440 /*
5441  Vector< Vector<std::string> > new_names((1+columns_number)*lags_number);
5442 
5443  for(size_t i = 0; i < 1+lags_number; i++)
5444  {
5445  for(size_t j = 0; j < names.size(); j++)
5446  {
5447  new_names[i+j] = names[j];
5448 
5449  if(i != lags_number)
5450  {
5451  for(size_t k = 0; k < names[j].size();k++)
5452  {
5453  new_names[i+j][k].append("_lag_").append(std::string::from_size_t(lags_number-i).c_str());
5454  }
5455  }
5456  }
5457  }
5458 */
5459  return(time_series_prediction_names);
5460 }
5461 
5462 
5463 // Vector<std::string> DataSet::arrange_association_names(const Vector<std::string>& names) const method
5464 
5467 
5469 {
5470  Vector<std::string> association_names;
5471 
5472  return(association_names);
5473 }
5474 
5475 
5476 // void convert_time_series(void) method
5477 
5480 
5482 {
// Returns immediately when no lags are configured.
5483  if(lags_number == 0)
5484  {
5485  return;
5486  }
5487
// NOTE(review): the statements that perform the conversion are absent from this listing
// (only empty numbered lines remain below); consult the upstream source.
5489
5491
5493
5495 }
5496 
5497 
5498 // void convert_association(void) method
5499 
5502 
5504 {
// NOTE(review): every statement of this body is absent from this listing (only empty
// numbered lines remain); consult the upstream source for the actual conversion steps.
5506
5508
5510 }
5511 
5512 
5513 // void load_data(void) method
5514 
5516 
5518 {
// Loads the data matrix from the text data file: validates the file name, checks that
// the file can be opened, sizes the data set, reads every instance and names the variables.
// Throws std::logic_error when the file name is empty or the file cannot be opened.
5519  if(data_file_name.empty())
5520  {
5521  std::ostringstream buffer;
5522
5523  buffer << "OpenNN Exception: DataSet class.\n"
5524  << "void load_data(void) method.\n"
5525  << "Data file name has not been set.\n";
5526
5527  throw std::logic_error(buffer.str());
5528  }
5529
// The file is opened here only to verify that it is accessible; the helpers below
// reopen it themselves.
5530  std::ifstream file(data_file_name.c_str());
5531
5532  if(!file.is_open())
5533  {
5534  std::ostringstream buffer;
5535
5536  buffer << "OpenNN Exception: DataSet class.\n"
5537  << "void load_data(void) method.\n"
5538  << "Cannot open data file: " << data_file_name << "\n";
5539
5540  throw std::logic_error(buffer.str());
5541  }
5542
5543  file.close();
5544
// First pass sizes the data set and yields the labels; second pass fills the data matrix.
5545  const Vector< Vector<std::string> > nominal_labels = set_from_data_file();
5546
5547  read_from_data_file(nominal_labels);
5548
5549  // Variables name
5550
5551  Vector<std::string> columns_name;
5552
5553  if(header_line)
5554  {
5555  columns_name = read_header_line();
5556  }
5557  else
5558  {
// Without a header, columns get generated names: variable_0, variable_1, ...
5559  for(unsigned i = 0; i < nominal_labels.size(); i++)
5560  {
5561  std::ostringstream buffer;
5562
5563  buffer << "variable_" << i;
5564
5565  columns_name.push_back(buffer.str());
5566  }
5567  }
5568
5569  variables.set_names(columns_name, nominal_labels);
5570
// NOTE(review): the statement inside each of the three conditionals below is absent from
// this listing; presumably each invokes the matching conversion step. Confirm upstream.
5571  // Angular variables
5572
5573  if(!angular_variables.empty())
5574  {
5576  }
5577
5578  // Time series
5579
5580  if(lags_number != 0)
5581  {
5583  }
5584
5585  // Association
5586
5587  if(association)
5588  {
5590  }
5591 }
5592 
5593 
5595 
5597 {
5598  std::ifstream file;
5599 
5600  file.open(data_file_name.c_str(), std::ios::binary);
5601 
5602  if(!file.is_open())
5603  {
5604  std::ostringstream buffer;
5605 
5606  buffer << "OpenNN Exception: DataSet class.\n"
5607  << "void load_data_binary(void) method.\n"
5608  << "Cannot open data file: " << data_file_name << "\n";
5609 
5610  throw std::logic_error(buffer.str());
5611  }
5612 
5613  std::streamsize size = sizeof(size_t);
5614 
5615  size_t variables_number;
5616  size_t instances_number;
5617 
5618  file.read(reinterpret_cast<char*>(&variables_number), size);
5619  file.read(reinterpret_cast<char*>(&instances_number), size);
5620 
5621  size = sizeof(double);
5622 
5623  double value;
5624 
5625  data.set(instances_number, variables_number);
5626 
5627  for(size_t i = 0; i < variables_number*instances_number; i++)
5628  {
5629  file.read(reinterpret_cast<char*>(&value), size);
5630 
5631  data[i] = value;
5632  }
5633 
5634  file.close();
5635 }
5636 
5637 
5639 
5641 {
5642  std::ifstream file;
5643 
5644  file.open(data_file_name.c_str(), std::ios::binary);
5645 
5646  if(!file.is_open())
5647  {
5648  std::ostringstream buffer;
5649 
5650  buffer << "OpenNN Exception: DataSet class.\n"
5651  << "void load_data_binary(void) method.\n"
5652  << "Cannot open data file: " << data_file_name << "\n";
5653 
5654  throw std::logic_error(buffer.str());
5655  }
5656 
5657  std::streamsize size = sizeof(size_t);
5658 
5659  size_t variables_number;
5660  size_t instances_number;
5661 
5662  file.read(reinterpret_cast<char*>(&variables_number), size);
5663  file.read(reinterpret_cast<char*>(&instances_number), size);
5664 
5665  size = sizeof(double);
5666 
5667  double value;
5668 
5669  data.set(instances_number, variables_number);
5670 
5671  for(size_t i = 0; i < variables_number*instances_number; i++)
5672  {
5673  file.read(reinterpret_cast<char*>(&value), size);
5674 
5675  data[i] = value;
5676  }
5677 
5678 // if(get_missing_values().has_missing_values())
5679 // {
5680 // scrub_missing_values();
5681 // }
5682 
5683  size = sizeof(size_t);
5684 
5685  size_t time_series_instances_number;
5686  size_t time_series_variables_number;
5687 
5688  file.read(reinterpret_cast<char*>(&time_series_instances_number), size);
5689  file.read(reinterpret_cast<char*>(&time_series_variables_number), size);
5690 
5691  time_series_data.set(time_series_instances_number, time_series_variables_number);
5692 
5693  size = sizeof(double);
5694 
5695  for(size_t i = 0; i < time_series_instances_number*time_series_variables_number; i++)
5696  {
5697  file.read(reinterpret_cast<char*>(&value), size);
5698 
5699  time_series_data[i] = value;
5700  }
5701 
5702  file.close();
5703 }
5704 
5705 
5706 // void load_time_series_data(void) method
5707 
5709 /*
5710 void DataSet::load_time_series_data(void)
5711 {
5712  if(lags_number <= 0)
5713  {
5714  std::ostringstream buffer;
5715 
5716  buffer << "OpenNN Exception: DataSet class.\n"
5717  << "void load_time_series_data(void) const method.\n"
5718  << "Number of lags (" << lags_number << ") must be greater than zero.\n";
5719 
5720  throw std::logic_error(buffer.str());
5721  }
5722 
5723 
5724  if(header)
5725  {
5726 // Vector<std::string> columns_name;
5727 
5728 // variables.set_names(names);
5729  }
5730 
5731 
5732  const Matrix<double> time_series_data(data_file_name);
5733 
5734  const size_t rows_number = time_series_data.get_rows_number();
5735  const size_t columns_number = time_series_data.get_columns_number();
5736 
5737  const size_t instances_number = rows_number - lags_number;
5738  const size_t variables_number = columns_number*(1 + lags_number);
5739 
5740  set(variables_number, instances_number);
5741 
5742  Vector<double> row(rows_number);
5743 
5744  for(size_t i = 0; i < instances_number; i++)
5745  {
5746  row = time_series_data.arrange_row(i);
5747 
5748  for(size_t j = 1; j <= lags_number; j++)
5749  {
5750  row = row.assemble(time_series_data.arrange_row(i+j));
5751  }
5752 
5753  data.set_row(i, row);
5754  }
5755 
5756  // Variables
5757 
5758  Vector<Variables::Use> uses(variables_number);
5759 
5760  std::fill(uses.begin(), uses.begin()+lags_number*variables_number/(lags_number+1)-1, Variables::Use::Input);
5761  std::fill(uses.begin()+lags_number*variables_number/(lags_number+1), uses.end(), Variables::Use::Target);
5762 
5763  variables.set_uses(uses);
5764 
5765 }
5766 */
5767 
5768 // Vector<size_t> calculate_target_distribution(void) const method
5769 
5774 
5776 {
5777  // Control sentence (if debug)
5778 
5779  const size_t instances_number = instances.get_instances_number();
5780  const size_t targets_number = variables.count_targets_number();
5781  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
5782 
5783  Vector<size_t> class_distribution;
5784 
5785  if(targets_number == 1) // Two classes
5786  {
5787  class_distribution.set(2, 0);
5788 
5789  size_t target_index = targets_indices[0];
5790 
5791  for(size_t instance_index = 0; instance_index < instances_number; instance_index++)
5792  {
5793  if(missing_values.arrange_missing_indices()[target_index].contains(instance_index))
5794  {
5795  continue;
5796  }
5797  if(instances.get_use(instance_index) != Instances::Unused)
5798  {
5799  if(data(instance_index,target_index) < 0.5)
5800  {
5801  class_distribution[0]++;
5802  }
5803  else
5804  {
5805  class_distribution[1]++;
5806  }
5807  }
5808  }
5809  }
5810  else // More than two classes
5811  {
5812  class_distribution.set(targets_number, 0);
5813 
5814  for(size_t i = 0; i < instances_number; i++)
5815  {
5816  if(instances.get_use(i) != Instances::Unused)
5817  {
5818  for(size_t j = 0; j < targets_number; j++)
5819  {
5820  if(data(i,targets_indices[j]) == -123.456)
5821  {
5822  continue;
5823  }
5824 
5825  if(data(i,targets_indices[j]) > 0.5)
5826  {
5827  class_distribution[j]++;
5828  }
5829  }
5830  }
5831  }
5832  }
5833 
5834  // Check data consistency
5835 
5836 // const size_t used_instances_number = instances.count_used_instances_number();
5837 
5838 // if(class_distribution.calculate_sum() != used_instances_number)
5839 // {
5840 // std::ostringstream buffer;
5841 
5842 // buffer << "OpenNN Exception: DataSet class.\n"
5843 // << "Vector<size_t> calculate_target_distribution(void) const method.\n"
5844 // << "Sum of class distributions (" << class_distribution << ") is not equal to "
5845 // << "number of used instances (" << used_instances_number << ")." << std::endl;
5846 
5847 // throw std::logic_error(buffer.str());
5848 // }
5849 
5850  return(class_distribution);
5851 }
5852 
5853 
5854 // Vector<double> calculate_distances(void) const method
5855 
5858 
5860 {
5861  const Matrix<double> data_statistics_matrix = calculate_data_statistics_matrix();
5862 
5863  const Vector<double> means = data_statistics_matrix.arrange_column(2);
5864  const Vector<double> standard_deviations = data_statistics_matrix.arrange_column(3);
5865 
5866  const size_t instances_number = instances.get_instances_number();
5867  Vector<double> distances(instances_number);
5868 
5869  const size_t variables_number = variables.get_variables_number();
5870  Vector<double> instance(variables_number);
5871 
5872  int i = 0;
5873 
5874  #pragma omp parallel for private(i, instance)
5875 
5876  for(i = 0; i < (int)instances_number; i++)
5877  {
5878  instance = data.arrange_row(i);
5879 
5880  distances[i] = (instance-means/standard_deviations).calculate_norm();
5881  }
5882 
5883  return(distances);
5884 }
5885 
5886 
5887 // Vector<size_t> balance_binary_targets_class_distribution(void) method
5888 
5893 
5895 {
5896  Vector<size_t> unused_instances;
5897 
5898  const size_t instances_number = instances.count_used_instances_number();
5899 
5900  const Vector<size_t> target_class_distribution = calculate_target_distribution();
5901 
5902  const Vector<size_t> maximal_indices = target_class_distribution.calculate_maximal_indices(2);
5903 
5904  const size_t maximal_target_class_index = maximal_indices[0];
5905  const size_t minimal_target_class_index = maximal_indices[1];
5906 
5907  size_t total_unbalanced_instances_number = (size_t)((percentage/100.0)*(target_class_distribution[maximal_target_class_index] - target_class_distribution[minimal_target_class_index]));
5908 
5909  size_t actual_unused_instances_number;
5910 
5911  size_t unbalanced_instances_number = total_unbalanced_instances_number/10;
5912 
5913  Vector<size_t> actual_unused_instances;
5914 
5915 // std::cout << "Instances to unuse: " << total_unbalanced_instances_number << std::endl;
5916 
5917  while(total_unbalanced_instances_number != 0)
5918  {
5919  if(total_unbalanced_instances_number < instances_number/10)
5920  {
5921  unbalanced_instances_number = total_unbalanced_instances_number;
5922  }
5923  else if(total_unbalanced_instances_number > 0 && unbalanced_instances_number < 1)
5924  {
5925  unbalanced_instances_number = total_unbalanced_instances_number;
5926  }
5927 
5928  actual_unused_instances = unuse_most_populated_target(unbalanced_instances_number);
5929 
5930  actual_unused_instances_number = actual_unused_instances.size();
5931 
5932  unused_instances = unused_instances.assemble(actual_unused_instances);
5933 
5934  total_unbalanced_instances_number = total_unbalanced_instances_number - actual_unused_instances_number;
5935 
5936  actual_unused_instances.clear();
5937  }
5938 
5939  return (unused_instances);
5940 }
5941 
5942 
5943 // Vector<size_t> balance_multiple_targets_distribution(void) method
5944 
5948 
5950 {
5951  Vector<size_t> unused_instances;
5952 
5953  const size_t bins_number = 10;
5954 
5955  const Vector<size_t> target_class_distribution = calculate_target_distribution();
5956 
5957  const size_t targets_number = variables.count_targets_number();
5958 
5959  const Vector<size_t> inputs_variables_indices = variables.arrange_inputs_indices();
5960  const Vector<size_t> targets_variables_indices = variables.arrange_targets_indices();
5961 
5962  const Vector<size_t> maximal_target_class_indices = target_class_distribution.calculate_maximal_indices(targets_number);
5963 
5964  const size_t minimal_target_class_index = maximal_target_class_indices[targets_number - 1];
5965 
5966  // Target class differences
5967 
5968  Vector<size_t> target_class_differences(targets_number);
5969 
5970  for(size_t i = 0; i < targets_number; i++)
5971  {
5972  target_class_differences[i] = target_class_distribution[i] - target_class_distribution[minimal_target_class_index];
5973  }
5974 
5975  Vector<double> instance;
5976 
5977  size_t count_instances = 0;
5978 
5979  size_t unbalanced_instances_number;
5980 
5981  size_t instances_number;
5982  Vector<size_t> instances_indices;
5983 
5984  Vector< Histogram<double> > data_histograms;
5985 
5986  Matrix<size_t> total_frequencies;
5987  Vector<size_t> instance_frequencies;
5988 
5989  size_t maximal_difference_index;
5990  size_t instance_index;
5991  size_t instance_target_index;
5992 
5993  Vector<size_t> unbalanced_instances_indices;
5994 
5995  while(!target_class_differences.is_in(0, 0))
5996  {
5997  unbalanced_instances_indices.clear();
5998  instances_indices.clear();
5999 
6000  instances_indices = instances.arrange_used_indices();
6001 
6002  instances_number = instances_indices.size();
6003 
6004 
6005  maximal_difference_index = target_class_differences.calculate_maximal_index();
6006 
6007  unbalanced_instances_number = (size_t) (target_class_differences[maximal_difference_index]/10);
6008 
6009  if(unbalanced_instances_number < 1)
6010  {
6011  unbalanced_instances_number = 1;
6012  }
6013 
6014  data_histograms = calculate_data_histograms(bins_number);
6015 
6016  total_frequencies.clear();
6017 
6018  total_frequencies.set(instances_number, 2);
6019 
6020  count_instances = 0;
6021 
6022  for(size_t i = 0; i < instances_number; i++)
6023  {
6024  instance_index = instances_indices[i];
6025 
6026  instance = get_instance(instance_index);
6027 
6028  instance_target_index = targets_variables_indices[maximal_difference_index];
6029 
6030  if(instance[instance_target_index] == 1.0)
6031  {
6032  instance_frequencies = instance.calculate_total_frequencies(data_histograms);
6033 
6034  total_frequencies(count_instances, 0) = instance_frequencies.calculate_partial_sum(inputs_variables_indices);
6035  total_frequencies(count_instances, 1) = instance_index;
6036 
6037  count_instances++;
6038  }
6039  }
6040 
6041  unbalanced_instances_indices = total_frequencies.sort_greater_rows(0).arrange_column(1).arrange_subvector_first(unbalanced_instances_number);
6042 
6043  unused_instances = unused_instances.assemble(unbalanced_instances_indices);
6044 
6045  instances.set_unused(unbalanced_instances_indices);
6046 
6047  target_class_differences[maximal_difference_index] = target_class_differences[maximal_difference_index] - unbalanced_instances_number;
6048  }
6049 
6050  return(unused_instances);
6051 }
6052 
6053 
6054 // Vector<size_t> unuse_most_populated_target(const size_t&)
6055 
6061 
6062 Vector<size_t> DataSet::unuse_most_populated_target(const size_t& instances_to_unuse)
6063 {
6064  Vector<size_t> most_populated_instances(instances_to_unuse);
6065 
6066  if(instances_to_unuse == 0)
6067  {
6068  return(most_populated_instances);
6069  }
6070 
6071  const size_t bins_number = 10;
6072 
6073  // Variables
6074 
6075  const size_t targets_number = variables.count_targets_number();
6076 
6079 
6080  const Vector<size_t> unused_variables = variables.arrange_unused_indices();
6081 
6082  // Instances
6083 
6084  const Vector<size_t> used_instances = instances.arrange_used_indices();
6085 
6086  const size_t used_instances_number = instances.count_used_instances_number();
6087 
6088  // Most populated target
6089 
6090  const Vector< Histogram<double> > data_histograms = calculate_data_histograms(bins_number);
6091 
6092  size_t most_populated_target = 0;
6093  size_t most_populated_bin = 0;
6094 
6095  size_t frequency;
6096  size_t maximum_frequency = 0;
6097 
6098  size_t unused = 0;
6099 
6100  for(size_t i = 0; i < targets_number; i++)
6101  {
6102  frequency = data_histograms[targets[i] - unused_variables.count_less_than(targets[i])].calculate_maximum_frequency();
6103 
6104  if(frequency > maximum_frequency)
6105  {
6106  unused = unused_variables.count_less_than(targets[i]);
6107 
6108  maximum_frequency = frequency;
6109 
6110  most_populated_target = targets[i];
6111 
6112  most_populated_bin = data_histograms[targets[i] - unused].calculate_most_populated_bin();
6113  }
6114  }
6115 
6116  // Calculates frequencies of the instances which belong to the most populated target
6117 
6118  size_t index;
6119  size_t bin;
6120  double value;
6121  Vector<double> instance;
6122 
6123  Vector<size_t> instance_frequencies;
6124 
6125  Matrix<size_t> total_instances_frequencies(maximum_frequency, 2);
6126 
6127  size_t count_instances = 0;
6128 
6129  int i = 0;
6130 
6131 // #pragma omp parallel for private(i, instance_index, instance, instance_value, instance_bin, instance_frequencies) shared(count_instances, total_instances_frequencies, maximal_frequency_target_index)
6132 
6133  for(i = 0; i < (int)used_instances_number; i++)
6134  {
6135  index = used_instances[i];
6136 
6137  instance = get_instance(index);
6138 
6139  value = instance[most_populated_target];
6140 
6141  bin = data_histograms[most_populated_target - unused].calculate_bin(value);
6142 
6143  if(bin == most_populated_bin)
6144  {
6145  instance_frequencies = instance.calculate_total_frequencies(data_histograms);
6146 
6147  total_instances_frequencies(count_instances, 0) = instance_frequencies.calculate_partial_sum(inputs);
6148  total_instances_frequencies(count_instances, 1) = used_instances[i];
6149 
6150  count_instances++;
6151  }
6152  }
6153 
6154  // Unuses instances
6155 
6156  if(instances_to_unuse > maximum_frequency)
6157  {
6158  most_populated_instances = total_instances_frequencies.sort_greater_rows(0).arrange_column(1).arrange_subvector_first(maximum_frequency);
6159  }
6160  else
6161  {
6162  most_populated_instances = total_instances_frequencies.sort_greater_rows(0).arrange_column(1).arrange_subvector_first(instances_to_unuse);
6163  }
6164 
6165  instances.set_unused(most_populated_instances);
6166 
6167  return(most_populated_instances);
6168 }
6169 
6170 
6171 // Vector<size_t> balance_approximation_targets_distribution(void)
6172 
6177 
6179 {
6180  Vector<size_t> unused_instances;
6181 
6182  const size_t instances_number = instances.count_used_instances_number();
6183 
6184  const size_t instances_to_unuse = (size_t)(instances_number*percentage/100.0);
6185 
6186  size_t count;
6187 
6188  while(unused_instances.size() < instances_to_unuse)
6189  {
6190  if(instances_to_unuse - unused_instances.size() < instances_to_unuse/10)
6191  {
6192  count = instances_to_unuse - unused_instances.size();
6193  }
6194  else
6195  {
6196  count = instances_to_unuse/10;
6197  }
6198 
6199  if(count == 0)
6200  {
6201  count = 1;
6202  }
6203 
6204  unused_instances = unused_instances.assemble(unuse_most_populated_target(count));
6205  }
6206 
6207  return(unused_instances);
6208 }
6209 
6210 
6211 // Vector<size_t> arrange_binary_inputs_indices(void) const method
6212 
6214 
6216 {
6217  const size_t inputs_number = variables.count_inputs_number();
6218 
6219  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
6220 
6221  Vector<size_t> binary_inputs_indices;
6222 
6223  for(size_t i = 0; i < inputs_number; i++)
6224  {
6225  if(is_binary_variable(inputs_indices[i]))
6226  {
6227  binary_inputs_indices.push_back(inputs_indices[i]);
6228  }
6229  }
6230 
6231  return(binary_inputs_indices);
6232 }
6233 
6234 
6235 // Vector<size_t> arrange_real_inputs_indices(void) const method
6236 
6238 
6240 {
6241  const size_t inputs_number = variables.count_inputs_number();
6242 
6243  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
6244 
6245  Vector<size_t> real_inputs_indices;
6246 
6247  for(size_t i = 0; i < inputs_number; i++)
6248  {
6249  if(!is_binary_variable(inputs_indices[i]))
6250  {
6251  real_inputs_indices.push_back(inputs_indices[i]);
6252  }
6253  }
6254 
6255  return(real_inputs_indices);
6256 }
6257 
6258 
6259 // void sum_binary_inputs(void) method
6260 
6262 
6264 {
6265 // const size_t inputs_number = variables.count_inputs_number();
6266 
6267 // const size_t instances_number = instances.get_instances_number();
6268 
6269 // const Vector<size_t> binary_inputs_indices = arrange_binary_inputs_indices();
6270 
6271 // const size_t binary_inputs_number = binary_inputs_indices.size();
6272 
6273 // Vector<double> binary_variable(instances_number, 0.0);
6274 
6275 // for(size_t i = 0; i < binary_inputs_number; i++)
6276 // {
6277 // binary_variable += data.arrange_column(binary_inputs_indices[i]);
6278 // }
6279 
6280 // const Vector<size_t> real_inputs_indices = arrange_real_inputs_indices();
6281 
6282 // Matrix<double> new_data = data.arrange_submatrix_columns(real_inputs_indices);
6283 
6284 // new_data.append_column(binary_variable);
6285 
6286 // new_data = new_data.assemble_columns(arrange_target_data());
6287 }
6288 
6289 
// Matrix<double> calculate_instances_distances(const size_t&) const method
6291 
6296 
6297 Matrix<double> DataSet::calculate_instances_distances(const size_t& nearest_neighbours_number) const
6298 {
6299  const size_t instances_number = instances.count_used_instances_number();
6300  const Vector<size_t> instances_indices = instances.arrange_used_indices();
6301 
6302  //Matrix<double> distances(instances_number, instances_number, 0.0);
6303 
6304  Vector<double> distances(nearest_neighbours_number , -1.0);
6305 
6306  Vector<double> instance;
6307  Vector<double> other_instance;
6308 
6309  size_t maximal_index;
6310 
6311  double distance;
6312 
6313 // #pragma omp parallel for private(maximal_index, distance) collapse(2)
6314 
6315  for(size_t i = 0; i < instances_number; i++)
6316  {
6317  for(size_t j = 0; j < instances_number; j++)
6318  {
6319 
6320  distance = data.calculate_distance(instances_indices[i], instances_indices[j]);
6321 
6322  if(distances.count_greater_than(distance) != 0)
6323  {
6324  maximal_index = distances.calculate_maximal_index();
6325 
6326  distances[maximal_index] = distance;
6327  }
6328  }
6329  }
6330 
6331  Matrix<double> distances1(instances_number, instances_number, 0.0);
6332  return(distances1);
6333 }
6334 
6335 
6336 // Matrix<size_t> calculate_nearest_neighbors(const Matrix<double>&, const size_t&) const
6337 
6343 
6344 Matrix<size_t> DataSet::calculate_nearest_neighbors(const Matrix<double>& distances, const size_t& nearest_neighbours_number) const
6345 {
6346  const size_t instances_number = instances.count_used_instances_number();
6347 
6348  Matrix<size_t> nearest_neighbors(instances_number, nearest_neighbours_number);
6349 
6350  Vector<double> instance_distances;
6351  Vector<size_t> minimal_distances_indices;
6352 
6353  for(size_t i = 0; i < instances_number; i++)
6354  {
6355  instance_distances = distances.arrange_row(i);
6356  minimal_distances_indices = instance_distances.calculate_minimal_indices(nearest_neighbours_number + 1);
6357 
6358  for(size_t j = 0; j < nearest_neighbours_number; j++)
6359  {
6360  nearest_neighbors(i, j) = minimal_distances_indices[j + 1];
6361  }
6362  }
6363 
6364  return(nearest_neighbors);
6365 }
6366 
6367 
// Vector<double> calculate_k_distances(const Matrix<double>&, const size_t&) const method
6369 
6374 
6375 Vector<double> DataSet::calculate_k_distances(const Matrix<double>& distances, const size_t& nearest_neighbours_number) const
6376 {
6377  const size_t instances_number = instances.count_used_instances_number();
6378 
6379  const Matrix<size_t> nearest_neighbors = calculate_nearest_neighbors(distances, nearest_neighbours_number);
6380 
6381  size_t maximal_index;
6382 
6383  Vector<double> k_distances(instances_number);
6384 
6385  for(size_t i = 0; i < instances_number; i++)
6386  {
6387  maximal_index = nearest_neighbors(i, nearest_neighbours_number - 1);
6388 
6389  k_distances[i] = distances.arrange_row(i)[maximal_index];
6390  }
6391 
6392  return(k_distances);
6393 }
6394 
6395 
// Matrix<double> calculate_reachability_distances(const Matrix<double>&, const Vector<double>&) const method
6397 
6401 
6403 {
6404  const size_t instances_number = instances.count_used_instances_number();
6405 
6406  Matrix<double> reachability_distances(instances_number, instances_number);
6407 
6408  for(size_t i = 0; i < instances_number; i++)
6409  {
6410  for(size_t j = i; j < instances_number; j++)
6411  {
6412  if(distances(i, j) <= k_distances[i])
6413  {
6414  reachability_distances(i, j) = k_distances[i];
6415  reachability_distances(j, i) = k_distances[i];
6416  }
6417  else if(distances(i, j) > k_distances[i])
6418  {
6419  reachability_distances(i, j) = distances(i, j);
6420  reachability_distances(j, i) = distances(j, i);
6421  }
6422  }
6423  }
6424 
6425  return (reachability_distances);
6426 }
6427 
6428 
6429 // Vector<double> calculate_reachability_density(const Matrix<double>&, const size_t&) const
6430 
6434 
6435 Vector<double> DataSet::calculate_reachability_density(const Matrix<double>& distances, const size_t& nearest_neighbours_number) const
6436 {
6437  const size_t instances_number = instances.count_used_instances_number();
6438 
6439  const Vector<double> k_distances = calculate_k_distances(distances, nearest_neighbours_number);
6440 
6441  const Matrix<double> reachability_distances = calculate_reachability_distances(distances, k_distances);
6442 
6443  const Matrix<size_t> nearest_neighbors_indices = calculate_nearest_neighbors(distances, nearest_neighbours_number);
6444 
6445  Vector<double> reachability_density(instances_number);
6446 
6447  Vector<size_t> nearest_neighbors_instance;
6448 
6449  for(size_t i = 0; i < instances_number; i++)
6450  {
6451  nearest_neighbors_instance = nearest_neighbors_indices.arrange_row(i);
6452 
6453  reachability_density[i] = nearest_neighbours_number/ reachability_distances.arrange_row(i).calculate_partial_sum(nearest_neighbors_instance);
6454  }
6455 
6456  return (reachability_density);
6457 
6458 }
6459 
6460 
6461 // Vector<double> calculate_local_outlier_factor(const size_t&) const
6462 
6465 
6466 Vector<double> DataSet::calculate_local_outlier_factor(const size_t& nearest_neighbours_number) const
6467 {
6468  const size_t instances_number = instances.count_used_instances_number();
6469 
6470  const Matrix<double> distances = calculate_instances_distances(nearest_neighbours_number);
6471  const Vector<double> reachability_density = calculate_reachability_density(distances, nearest_neighbours_number);
6472 
6473  const Matrix<size_t> nearest_neighbors = calculate_nearest_neighbors(distances, nearest_neighbours_number);
6474 
6475  Vector<size_t> instance_nearest_neighbors(nearest_neighbours_number);
6476 
6477  Vector<double> local_outlier_factor(instances_number);
6478 
6479  for(size_t i = 0; i < instances_number; i++)
6480  {
6481  instance_nearest_neighbors = nearest_neighbors.arrange_row(i);
6482 
6483  local_outlier_factor[i] = (reachability_density.calculate_partial_sum(instance_nearest_neighbors))/(nearest_neighbours_number*reachability_density[i]);
6484  }
6485 
6486  return (local_outlier_factor);
6487 }
6488 
6489 
6490 // Vector<size_t> clean_local_outlier_factor(const size_t&)
6491 
6495 
6496 Vector<size_t> DataSet::clean_local_outlier_factor(const size_t& nearest_neighbours_number)
6497 {
6498  Vector<size_t> unused_instances;
6499 
6500  const Vector<double> local_outlier_factor = calculate_local_outlier_factor(nearest_neighbours_number);
6501 
6502  const size_t instances_number = instances.count_used_instances_number();
6503  const Vector<size_t> instances_indices = instances.arrange_used_indices();
6504 
6505  for(size_t i = 0; i < instances_number; i++)
6506  {
6507  if(local_outlier_factor[i] > 1.6001)
6508  {
6509  instances.set_use(instances_indices[i], Instances::Unused);
6510 
6511  unused_instances.push_back(instances_indices[i]);
6512  }
6513  }
6514 
6515  return(unused_instances);
6516 }
6517 
6518 // Vector<size_t> calculate_Tukey_outliers(const size_t&, const double&) const method
6519 
6523 
6524 Vector<size_t> DataSet::calculate_Tukey_outliers(const size_t& variable_index, const double& cleaning_parameter) const
6525 {
6526  const size_t instances_number = instances.count_used_instances_number();
6527  const Vector<size_t> instances_indices = instances.arrange_used_indices();
6528 
6529  double interquartile_range;
6530 
6531  Vector<size_t> unused_instances_indices;
6532 
6533  if(is_binary_variable(variable_index))
6534  {
6535  return(unused_instances_indices);
6536  }
6537 
6538  const Vector<double> box_plot = data.arrange_column(variable_index).calculate_box_plots();
6539 
6540  if(box_plot[3] == box_plot[1])
6541  {
6542  return(unused_instances_indices);
6543  }
6544  else
6545  {
6546  interquartile_range = std::abs((box_plot[3] - box_plot[1]));
6547  }
6548 
6549  for(size_t j = 0; j < instances_number; j++)
6550  {
6551  const Vector<double> instance = get_instance(instances_indices[j]);
6552 
6553  if(instance[variable_index] < (box_plot[1] - cleaning_parameter*interquartile_range))
6554  {
6555  unused_instances_indices.push_back(instances_indices[j]);
6556  }
6557  else if(instance[variable_index] > (box_plot[3] + cleaning_parameter*interquartile_range))
6558  {
6559  unused_instances_indices.push_back(instances_indices[j]);
6560  }
6561  }
6562 
6563  return(unused_instances_indices);
6564 }
6565 
6566 
6567 // Vector< Vector<size_t> > calculate_Tukey_outliers(const double&) const
6568 
6571 
6572 Vector< Vector<size_t> > DataSet::calculate_Tukey_outliers(const double& cleaning_parameter) const
6573 {
6574  const size_t instances_number = instances.count_used_instances_number();
6575  const Vector<size_t> instances_indices = instances.arrange_used_indices();
6576 
6577  const size_t variables_number = variables.count_used_variables_number();
6578  const Vector<size_t> used_variables_indices = variables.arrange_used_indices();
6579 
6580  double interquartile_range;
6581 
6582  Vector< Vector<size_t> > return_values(2);
6583  return_values[0] = Vector<size_t>(instances_number, 0);
6584  return_values[1] = Vector<size_t>(variables_number, 0);
6585 
6586  size_t variable_index;
6587 
6588  Vector< Vector<double> > box_plots(variables_number);
6589 
6590 #pragma omp parallel for private(variable_index) schedule(dynamic)
6591 
6592  for(int i = 0; i < (int)variables_number; i++)
6593  {
6594  variable_index = used_variables_indices[i];
6595 
6596  if(is_binary_variable(variable_index))
6597  {
6598  continue;
6599  }
6600 
6601  box_plots[i] = data.arrange_column(variable_index).calculate_box_plots();
6602  }
6603 
6604  for(int i = 0; i < (int)variables_number; i++)
6605  {
6606  variable_index = used_variables_indices[i];
6607 
6608  if(is_binary_variable(variable_index))
6609  {
6610  continue;
6611  }
6612 
6613  const Vector<double> variable_box_plot = box_plots[i];
6614 
6615  if(variable_box_plot[3] == variable_box_plot[1])
6616  {
6617  continue;
6618  }
6619  else
6620  {
6621  interquartile_range = std::abs((variable_box_plot[3] - variable_box_plot[1]));
6622  }
6623 
6624  size_t variables_outliers = 0;
6625 
6626 #pragma omp parallel for schedule(dynamic) reduction(+ : variables_outliers)
6627 
6628  for(int j = 0; j < instances_number; j++)
6629  {
6630  const Vector<double> instance = get_instance(instances_indices[j]);
6631 
6632  if(instance[variable_index] < (variable_box_plot[1] - cleaning_parameter*interquartile_range) ||
6633  instance[variable_index] > (variable_box_plot[3] + cleaning_parameter*interquartile_range))
6634  {
6635  return_values[0][j] = 1;
6636 
6637  variables_outliers++;
6638  }
6639  }
6640 
6641  return_values[1][i] = variables_outliers;
6642  }
6643 
6644  return(return_values);
6645 }
6646 
6647 
6648 // Matrix<double> calculate_autocorrelation(const size_t&) const method
6649 
6654 
6655 Matrix<double> DataSet::calculate_autocorrelation(const size_t& maximum_lags_number) const
6656 {
6658  {
6659  std::ostringstream buffer;
6660 
6661  buffer << "OpenNN Exception: DataSet class.\n"
6662  << "Matrix<double> calculate_autocorrelation(const size_t&) method.\n"
6663  << "Maximum lags number (" << maximum_lags_number << ") is greater than the number of instances (" << instances.count_used_instances_number() <<") \n";
6664 
6665  throw std::logic_error(buffer.str());
6666  }
6667 
6668  const size_t variables_number = time_series_data.get_columns_number();
6669 
6670  Matrix<double> autocorrelation(variables_number, maximum_lags_number);
6671 
6672  for(size_t i = 0; i < maximum_lags_number; i++)
6673  {
6674  for(size_t j = 0; j < variables_number; j++)
6675  {
6676  autocorrelation.set_row(j, time_series_data.arrange_column(j).calculate_autocorrelation(maximum_lags_number));
6677  }
6678  }
6679 
6680  return autocorrelation;
6681 }
6682 
6683 
6684 // Matrix< Vector<double> > calculate_cross_correlation(void)
6685 
6687 
6689 {
6690  const size_t variables_number = variables.count_used_variables_number()/(lags_number + steps_ahead);
6691 
6692  Matrix< Vector<double> > cross_correlation(variables_number, variables_number);
6693 
6694  Vector<double> actual_column;
6695 
6696  for(size_t i = 0; i < variables_number; i++)
6697  {
6698  actual_column = time_series_data.arrange_column(i);
6699 
6700  for(size_t j = 0; j < variables_number; j++)
6701  {
6702  cross_correlation(i , j) = actual_column.calculate_cross_correlation(time_series_data.arrange_column(j));
6703  }
6704  }
6705 
6706  return(cross_correlation);
6707 }
6708 
6709 
6710 // void generate_data_approximation(const size_t&, const size_t&) method
6711 
6716 
6717 void DataSet::generate_data_approximation(const size_t& instances_number, const size_t& variables_number)
6718 {
6719  const size_t inputs_number = variables_number-1;
6720  const size_t targets_number = 1;
6721 
6722 // Matrix<double> input_data(instances_number, inputs_number);
6723 // input_data.randomize_uniform(-2.048, 2.048);
6724 
6725 // Matrix<double> target_data(instances_number, targets_number);
6726 
6727 // Matrix<double> new_data(instances_number, inputs_number+targets_number);
6728 
6729  data.set(instances_number, variables_number);
6730 
6731  data.randomize_uniform(-2.048, 2.048);
6732 
6733  double rosenbrock;
6734 
6735  for(size_t i = 0; i < instances_number; i++)
6736  {
6737  data(i, inputs_number) = data.arrange_row(i, Vector<size_t>(0,1,inputs_number-1)).calculate_norm();
6738 
6739  rosenbrock = 0.0;
6740 
6741  for(size_t j = 0; j < inputs_number-1; j++)
6742  {
6743  rosenbrock +=
6744  (1.0 - data(i,j))*(1.0 - data(i,j))
6745  + 100.0*(data(i,j+1)-data(i,j)*data(i,j))*(data(i,j+1)-data(i,j)*data(i,j));
6746  }
6747 
6748  data(i, inputs_number) = rosenbrock;
6749 // target_data(i, 0) = input_data.arrange_row(i).calculate_norm();
6750 
6751 // rosenbrock = 0.0;
6752 
6753 // for(size_t j = 0; j < inputs_number-1; j++)
6754 // {
6755 // rosenbrock +=
6756 // (1.0 - input_data(i,j))*(1.0 - input_data(i,j))
6757 // + 100.0*(input_data(i,j+1)-input_data(i,j)*input_data(i,j))*(input_data(i,j+1)-input_data(i,j)*input_data(i,j));
6758 // }
6759 
6760 // target_data(i, 0) = rosenbrock;
6761  }
6762 
6763 // set(input_data.assemble_columns(target_data));
6764 
6765 // set(new_data);
6766 
6768 }
6769 
6770 
6771 // void generate_data_binary_classification(const size_t&, const size_t&) method
6772 
6776 
6777 void DataSet::generate_data_binary_classification(const size_t& instances_number, const size_t& inputs_number)
6778 {
6779  const size_t negatives = instances_number/2;
6780  const size_t positives = instances_number - negatives;
6781 
6782  // Negatives data
6783 
6784  Vector<double> target_0(negatives, 0.0);
6785 
6786  Matrix<double> class_0(negatives, inputs_number);
6787 
6788  class_0.randomize_normal(-0.5, 1.0);
6789 
6790  class_0.append_column(target_0);
6791 
6792  // Positives data
6793 
6794  Vector<double> target_1(positives, 1.0);
6795 
6796  Matrix<double> class_1(positives, inputs_number);
6797 
6798  class_1.randomize_normal(0.5, 1.0);
6799 
6800  class_1.append_column(target_1);
6801 
6802  // Assemble
6803 
6804  set(class_0.assemble_rows(class_1));
6805 
6806 }
6807 
6808 
6809 // void generate_data_multiple_classification(const size_t&, const size_t&) method
6810 
6812 
6813 void DataSet::generate_data_multiple_classification(const size_t&, const size_t&)
6814 {
6815 
6816 }
6817 
6818 
6819 // bool has_data(void) const method
6820 
6823 
6824 bool DataSet::has_data(void) const
6825 {
6826  if(data.empty())
6827  {
6828  return(false);
6829  }
6830  else
6831  {
6832  return(true);
6833  }
6834 }
6835 
6836 
6837 // Vector<size_t> filter_data(const Vector<double>&, const Vector<double>&) method
6838 
6844 
6846 {
// Scans every instance that is not already unused; as soon as one of its values lies outside
// [minimums[j], maximums[j]] the instance index is stored in the returned vector and the
// instance is set to Instances::Unused.
// Returns the indices of the instances that were filtered out by this call.
6847  const size_t variables_number = variables.get_variables_number();
6848 
6849  // Control sentence (if debug)
6850 
6851  #ifdef __OPENNN_DEBUG__
6852 
// Debug builds only: minimums must hold one entry per variable, otherwise std::logic_error is thrown.
6853  if(minimums.size() != variables_number)
6854  {
6855  std::ostringstream buffer;
6856 
6857  buffer << "OpenNN Exception: DataSet class.\n"
6858  << "Vector<size_t> filter_data(const Vector<double>&, const Vector<double>&) method.\n"
6859  << "Size of minimums (" << minimums.size() << ") is not equal to number of variables (" << variables_number << ").\n";
6860 
6861  throw std::logic_error(buffer.str());
6862  }
6863 
// Debug builds only: same size requirement for maximums.
6864  if(maximums.size() != variables_number)
6865  {
6866  std::ostringstream buffer;
6867 
6868  buffer << "OpenNN Exception: DataSet class.\n"
6869  << "Vector<size_t> filter_data(const Vector<double>&, const Vector<double>&) method.\n"
6870  << "Size of maximums (" << maximums.size() << ") is not equal to number of variables (" << variables_number << ").\n";
6871 
6872  throw std::logic_error(buffer.str());
6873  }
6874 
6875  #endif
6876 
6877 // const Vector< Vector<size_t> > missing_indices = missing_values.arrange_missing_indices();
6878 
6879  Vector<size_t> filtered_indices;
6880 
6881  const size_t instances_number = instances.get_instances_number();
6882 
// NOTE(review): the instance loop runs under an OpenMP parallel for. Only the push_back below is
// inside a critical section, so the order of the returned indices presumably depends on thread
// scheduling, and instances.set_use() is called outside the critical section - confirm that it
// is safe to call concurrently for different instances.
6883 #pragma omp parallel for
6884 
// The counter is a signed int and instances_number is cast accordingly (presumably because of
// OpenMP loop requirements - confirm).
6885  for(int i = 0; i < (int)instances_number; i++)
6886  {
// Instances that are already unused are neither checked nor reported.
6887  if(instances.is_unused(i))
6888  {
6889  continue;
6890  }
6891 
6892  for(size_t j = 0; j < variables_number; j++)
6893  {
// NOTE(review): the condition that guards the following block is not visible in this listing.
6895  {
6896  continue;
6897  }
6898 
// Value below its minimum or above its maximum: filter the instance.
6899  if(data(i,j) < minimums[j] || data(i,j) > maximums[j])
6900  {
6901  #pragma omp critical
6902  {
6903  filtered_indices.push_back(i);
6904  }
6905  instances.set_use(i, Instances::Unused);
6906 
// One out-of-range value is enough, the remaining variables of this instance are not checked.
6907  break;
6908  }
6909  }
6910  }
6911 
6912  return(filtered_indices);
6913 }
6914 
6915 
6916 // void convert_angular_variable_degrees(const size_t&) method
6917 
6922 
6923 void DataSet::convert_angular_variable_degrees(const size_t& variable_index)
6924 {
6925  // Control sentence (if debug)
6926 
6927  #ifdef __OPENNN_DEBUG__
6928 
6929  const size_t variables_number = variables.get_variables_number();
6930 
6931  if(variable_index >= variables_number)
6932  {
6933  std::ostringstream buffer;
6934 
6935  buffer << "OpenNN Exception: DataSet class.\n"
6936  << "void convert_angular_variable_degrees(const size_t&) method.\n"
6937  << "Index of variable (" << variable_index << ") must be less than number of variables (" << variables_number << ").\n";
6938 
6939  throw std::logic_error(buffer.str());
6940  }
6941 
6942  #endif
6943 
6945 
6946  Variables::Item sin_item = items[variable_index];
6947  prepend("sin_", sin_item.name);
6948 
6949  Variables::Item cos_item = items[variable_index];
6950  prepend("cos_", cos_item.name);
6951 
6952  items[variable_index] = sin_item;
6953  items = items.insert_element(variable_index, cos_item);
6954 
6955  variables.set_items(items);
6956 
6957  data.convert_angular_variables_degrees(variable_index);
6958 
6959 }
6960 
6961 
6962 // void convert_angular_variable_radians(const size_t&) method
6963 
6968 
6969 void DataSet::convert_angular_variable_radians(const size_t& variable_index)
6970 {
6971  // Control sentence (if debug)
6972 
6973  #ifdef __OPENNN_DEBUG__
6974 
6975  const size_t variables_number = variables.get_variables_number();
6976 
6977  if(variable_index >= variables_number)
6978  {
6979  std::ostringstream buffer;
6980 
6981  buffer << "OpenNN Exception: DataSet class.\n"
6982  << "void convert_angular_variable_radians(const size_t&) method.\n"
6983  << "Index of variable (" << variable_index << ") must be less than number of variables (" << variables_number << ").\n";
6984 
6985  throw std::logic_error(buffer.str());
6986  }
6987 
6988  #endif
6989 
6991 
6992  Variables::Item sin_item = items[variable_index];
6993  prepend("sin_", sin_item.name);
6994 
6995  Variables::Item cos_item = items[variable_index];
6996  prepend("cos_", cos_item.name);
6997 
6998  items[variable_index] = sin_item;
6999  items = items.insert_element(variable_index, cos_item);
7000 
7001  variables.set_items(items);
7002 
7003  data.convert_angular_variables_radians(variable_index);
7004 
7005 }
7006 
7007 
7008 // void convert_angular_variables_degrees(const Vector<size_t>&)
7009 
7014 
7016 {
// Processes every entry of indices in order, using a running offset (count) on each requested position.
7017  // Control sentence (if debug)
7018 
7019  #ifdef __OPENNN_DEBUG__
7020 
7021  const size_t variables_number = variables.get_variables_number();
7022 
// Debug builds only: every requested index must be less than the number of variables,
// otherwise std::logic_error is thrown.
7023  for(size_t i = 0; i < indices.size(); i++)
7024  {
7025  if(indices[i] >= variables_number)
7026  {
7027  std::ostringstream buffer;
7028 
// NOTE(review): the message prints the position i inside indices, not the offending value indices[i].
7029  buffer << "OpenNN Exception: DataSet class.\n"
7030  << "void convert_angular_variables_degrees(const Vector<size_t>&) method.\n"
7031  << "Index (" << i << ") must be less than number of variables (" << variables_number << ").\n";
7032 
7033  throw std::logic_error(buffer.str());
7034  }
7035  }
7036 
7037  #endif
7038 
7039  size_t size = indices.size();
7040 
// Number of entries already processed; added to each requested index below.
7041  unsigned count = 0;
7042 
7043  size_t index;
7044 
7045  for(size_t i = 0; i < size; i++)
7046  {
7047  index = indices[i]+count;
7048 
// NOTE(review): the statement that consumes index is not visible in this listing. Presumably it
// converts the variable at index and thereby adds one variable, which would explain the offset;
// that reasoning would also assume indices is sorted in ascending order - confirm.
7050 
7051  count++;
7052  }
7053 }
7054 
7055 
7056 // void convert_angular_variables_radians(const Vector<size_t>&)
7057 
7062 
7064 {
// Processes every entry of indices in order, using a running offset (count) on each requested position.
7065  // Control sentence (if debug)
7066 
7067  #ifdef __OPENNN_DEBUG__
7068 
7069  const size_t variables_number = variables.get_variables_number();
7070 
// Debug builds only: every requested index must be less than the number of variables,
// otherwise std::logic_error is thrown.
7071  for(size_t i = 0; i < indices.size(); i++)
7072  {
7073  if(indices[i] >= variables_number)
7074  {
7075  std::ostringstream buffer;
7076 
// NOTE(review): the message prints the position i inside indices, not the offending value indices[i].
7077  buffer << "OpenNN Exception: DataSet class.\n"
7078  << "void convert_angular_variables_radians(const Vector<size_t>&) method.\n"
7079  << "Index (" << i << ") must be less than number of variables (" << variables_number << ").\n";
7080 
7081  throw std::logic_error(buffer.str());
7082  }
7083  }
7084 
7085  #endif
7086 
7087  size_t size = indices.size();
7088 
// Number of entries already processed; added to each requested index below.
7089  unsigned count = 0;
7090 
7091  size_t index;
7092 
7093  for(size_t i = 0; i < size; i++)
7094  {
7095  index = indices[i]+count;
7096 
// NOTE(review): the statement that consumes index is not visible in this listing. Presumably it
// converts the variable at index and thereby adds one variable, which would explain the offset;
// that reasoning would also assume indices is sorted in ascending order - confirm.
7098 
7099  count++;
7100  }
7101 }
7102 
7103 
7104 // void convert_angular_variables(void) method
7105 
7109 
7111 {
// Dispatches on the angular_units member; any value other than Radians or Degrees raises std::logic_error.
7112  switch(angular_units)
7113  {
7114  case DataSet::Radians:
7115  {
// NOTE(review): the statement executed for radians is not visible in this listing.
7117  }
7118  break;
7119 
7120  case DataSet::Degrees:
7121  {
// NOTE(review): the statement executed for degrees is not visible in this listing.
7123  }
7124  break;
7125 
7126  default:
7127  {
7128  std::ostringstream buffer;
7129 
7130  buffer << "OpenNN Exception: DataSet class.\n"
7131  << "void convert_angular_variables(void) method.\n"
7132  << "Unknown angular units.\n";
7133 
7134  throw std::logic_error(buffer.str());
7135  }
7136  break;
7137  }
7138 
7139 }
7140 
7141 
7142 // void scrub_missing_values_unuse(void) method
7143 
7145 
7147 {
// Sets to Instances::Unused every instance reported by missing_values.arrange_missing_instances().
// The data values themselves are not modified here.
7148  const Vector<size_t> missing_instances = missing_values.arrange_missing_instances();
7149 
7150  for(size_t i = 0; i < missing_instances.size(); i++)
7151  {
7152  instances.set_use(missing_instances[i], Instances::Unused);
7153  }
7154 }
7155 
7156 
7157 // void scrub_missing_values_mean(void) method
7158 
7160 
7162 {
// Overwrites every missing entry of the data matrix with the value returned for its variable by
// data.calculate_mean_missing_values().
// NOTE(review): the declaration of missing_indices is not visible in this listing; from its use below
// it is indexed first by variable and then by position, yielding instance indices.
7164 
7165  const Vector<double> means = data.calculate_mean_missing_values(missing_indices);
7166 
7167  const size_t variables_number = variables.get_variables_number();
7168 
// NOTE(review): targets_indices is computed but never read in the visible code.
7169  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
7170 
7171  size_t instance_index;
7172 
// i runs over variables (columns), j over the missing entries recorded for variable i.
7173  for(size_t i = 0; i < variables_number; i++)
7174  {
7175  for(size_t j = 0; j < missing_indices[i].size(); j++)
7176  {
7177  instance_index = missing_indices[i][j];
7178 
7179  data(instance_index, i) = means[i];
7180  }
7181  }
7182 }
7183 
7184 
7185 // void scrub_missing_values(void) method
7186 
7190 
7192 {
// Dispatches on scrubbing_method; any value other than MissingValues::Unuse or MissingValues::Mean
// raises std::logic_error.
// NOTE(review): the declaration of scrubbing_method is not visible in this listing.
7194 
7195  switch(scrubbing_method)
7196  {
7197  case MissingValues::Unuse:
7198  {
// NOTE(review): the statement executed for the Unuse method is not visible in this listing.
7200  }
7201  break;
7202 
7203  case MissingValues::Mean:
7204  {
// NOTE(review): the statement executed for the Mean method is not visible in this listing.
7206  }
7207  break;
7208 
7209  default:
7210  {
7211  std::ostringstream buffer;
7212 
7213  buffer << "OpenNN Exception: DataSet class\n"
7214  << "void scrub_missing_values(void) method.\n"
7215  << "Unknown scrubbing method.\n";
7216 
7217  throw std::logic_error(buffer.str());
7218  }
7219  break;
7220  }
7221 }
7222 
7223 
7224 // size_t count_tokens(std::string& str) const method
7225 
7229 
7230 size_t DataSet::count_tokens(std::string& str) const
7231 {
7232 // if(!(this->find(separator) != std::string::npos))
7233 // {
7234 // std::ostringstream buffer;
7235 //
7236 // buffer << "OpenNN Exception:\n"
7237 // << "std::string class.\n"
7238 // << "inline size_t count_tokens(const std::string&) const method.\n"
7239 // << "Separator not found in string: \"" << separator << "\".\n";
7240 //
7241 // throw std::logic_error(buffer.str());
7242 // }
7243 
7244  trim(str);
7245 
7246  size_t tokens_count = 0;
7247 
7248  // Skip delimiters at beginning.
7249 
7250  const std::string separator_string = get_separator_string();
7251 
7252  std::string::size_type last_pos = str.find_first_not_of(separator_string, 0);
7253 
7254  // Find first "non-delimiter".
7255 
7256  std::string::size_type pos = str.find_first_of(separator_string, last_pos);
7257 
7258  while (std::string::npos != pos || std::string::npos != last_pos)
7259  {
7260  // Found a token, add it to the vector
7261 
7262  tokens_count++;
7263 
7264  // Skip delimiters. Note the "not_of"
7265 
7266  last_pos = str.find_first_not_of(separator_string, pos);
7267 
7268  // Find next "non-delimiter"
7269 
7270  pos = str.find_first_of(separator_string, last_pos);
7271  }
7272 
7273  return(tokens_count);
7274 }
7275 
7276 
7280 
7281 Vector<std::string> DataSet::get_tokens(const std::string& str) const
7282 {
7283  const std::string new_string = get_trimmed(str);
7284 
7285  Vector<std::string> tokens;
7286 
7287  const std::string separator_string = get_separator_string();
7288 
7289  // Skip delimiters at beginning.
7290 
7291  std::string::size_type lastPos = new_string.find_first_not_of(separator_string, 0);
7292 
7293  // Find first "non-delimiter"
7294 
7295  std::string::size_type pos = new_string.find_first_of(separator_string, lastPos);
7296 
7297  while(std::string::npos != pos || std::string::npos != lastPos)
7298  {
7299  // Found a token, add it to the vector
7300 
7301  tokens.push_back(new_string.substr(lastPos, pos - lastPos));
7302 
7303  // Skip delimiters. Note the "not_of"
7304 
7305  lastPos = new_string.find_first_not_of(separator_string, pos);
7306 
7307  // Find next "non-delimiter"
7308 
7309  pos = new_string.find_first_of(separator_string, lastPos);
7310  }
7311 
7312  for(size_t i = 0; i < tokens.size(); i++)
7313  {
7314  trim(tokens[i]);
7315  }
7316 
7317  return(tokens);
7318 }
7319 
7320 
7321 // bool is_numeric(const std::string&) const method
7322 
7325 
7326 bool DataSet::is_numeric(const std::string& str) const
7327 {
7328  std::istringstream iss(str.data());
7329 
7330  double dTestSink;
7331 
7332  iss >> dTestSink;
7333 
7334  // was any input successfully consumed/converted?
7335 
7336  if(!iss)
7337  {
7338  return false;
7339  }
7340 
7341  // was all the input successfully consumed/converted?
7342 
7343  return(iss.rdbuf()->in_avail() == 0);
7344 }
7345 
7346 
7347 // void DataSet::trim(std::string&) const method
7348 
7352 
7353 void DataSet::trim(std::string& str) const
7354 {
7355  //prefixing spaces
7356 
7357  str.erase(0, str.find_first_not_of(' '));
7358 
7359  //surfixing spaces
7360 
7361  str.erase(str.find_last_not_of(' ') + 1);
7362 }
7363 
7364 
7368 
7369 std::string DataSet::get_trimmed(const std::string& str) const
7370 {
7371  std::string output(str);
7372 
7373  //prefixing spaces
7374 
7375  output.erase(0, output.find_first_not_of(' '));
7376 
7377  //surfixing spaces
7378 
7379  output.erase(output.find_last_not_of(' ') + 1);
7380 
7381  return(output);
7382 }
7383 
7384 
7385 // std::string prepend(const std::string&, const std::string&) const method
7386 
7390 
7391 std::string DataSet::prepend(const std::string& pre, const std::string& str) const
7392 {
7393  std::ostringstream buffer;
7394 
7395  buffer << pre << str;
7396 
7397  return(buffer.str());
7398 }
7399 
7400 
7401 // bool is_numeric(const Vector<std::string>&) const
7402 
7405 
7407 {
// Returns true only if is_numeric() accepts every element of v; stops at the first element that
// is rejected. An empty v yields true.
7408  for(size_t i = 0; i < v.size(); i++)
7409  {
7410  if(!is_numeric(v[i]))
7411  {
7412  return false;
7413  }
7414  }
7415 
7416  return true;
7417 }
7418 
7419 
7420 // bool is_not_numeric(const Vector<std::string>&) const
7421 
7424 
7426 {
// Returns true only if is_numeric() rejects every element of v; stops at the first element that
// is accepted. An empty v yields true.
7427  for(size_t i = 0; i < v.size(); i++)
7428  {
7429  if(is_numeric(v[i]))
7430  {
7431  return false;
7432  }
7433  }
7434 
7435  return true;
7436 }
7437 
7438 
7439 // bool is_mixed(const Vector<std::string>&) const
7440 
7443 
7445 {
// Returns true when v holds at least one element accepted by is_numeric() and at least one
// element rejected by it; false otherwise (including an empty v).
7446  unsigned count_numeric = 0;
7447  unsigned count_not_numeric = 0;
7448 
// Every element is classified; there is no early exit.
7449  for(size_t i = 0; i < v.size(); i++)
7450  {
7451  if(is_numeric(v[i]))
7452  {
7453  count_numeric++;
7454  }
7455  else
7456  {
7457  count_not_numeric++;
7458  }
7459  }
7460 
7461  if(count_numeric > 0 && count_not_numeric > 0)
7462  {
7463  return true;
7464  }
7465  else
7466  {
7467  return false;
7468  }
7469 }
7470 
7471 }
7472 
7473 // OpenNN: Open Neural Networks Library.
7474 // Copyright (c) 2005-2016 Roberto Lopez.
7475 //
7476 // This library is free software; you can redistribute it and/or