data_set.cpp
1// OpenNN: Open Neural Networks Library
2// www.opennn.net
3//
4// D A T A S E T C L A S S
5//
6// Artificial Intelligence Techniques SL
7// artelnics@artelnics.com
8
9#include "data_set.h"
10
11using namespace OpenNN;
12
13namespace OpenNN
14{
15
// Default constructor: creates an empty data set via set().
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    set();

}
26
27
31
32DataSet::DataSet(const Tensor<type, 2>& data)
33{
34 set(data);
35
37}
38
39
46
47DataSet::DataSet(const Index& new_samples_number, const Index& new_variables_number)
48{
49 set(new_samples_number, new_variables_number);
50
52}
53
54
61
62DataSet::DataSet(const Index& new_samples_number, const Index& new_inputs_number, const Index& new_targets_number)
63{
64 set(new_samples_number, new_inputs_number, new_targets_number);
65
67}
68
69
75
76DataSet::DataSet(const string& data_file_name, const char& separator, const bool& has_columns_names)
77{
79}
80
81
// Destructor: releases the thread pool objects owned by this data set.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    delete non_blocking_thread_pool;
    delete thread_pool_device;
}
89
90
93
/// Returns the display flag, which controls whether informational messages
/// from this class are printed to the standard output.

const bool& DataSet::get_display() const
{
    return display;
}
98
99
// Column default constructor body: an unnamed numeric input column with no
// categories and mean/standard-deviation scaling.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    name = "";
    column_use = VariableUse::Input;
    type = ColumnType::Numeric;
    categories.resize(0);
    categories_uses.resize(0);

    scaler = Scaler::MeanStandardDeviation;
}
112
113
115
116DataSet::Column::Column(const string& new_name,
117 const VariableUse& new_column_use,
118 const ColumnType& new_type,
119 const Scaler& new_scaler,
120 const Tensor<string, 1>& new_categories,
121 const Tensor<VariableUse, 1>& new_categories_uses)
122{
123 name = new_name;
124 scaler = new_scaler;
125 column_use = new_column_use;
126 type = new_type;
127 categories = new_categories;
128 categories_uses = new_categories_uses;
129}
130
// Column destructor (trivial): all members release their own storage.
{}
135
136
/// Sets the scaling method to be applied to this column's variables.
/// @param new_scaler Scaler to use.

void DataSet::Column::set_scaler(const Scaler& new_scaler)
{
    scaler = new_scaler;
}
141
142
143void DataSet::Column::set_scaler(const string& new_scaler)
144{
145 if(new_scaler == "NoScaling")
146 {
147 set_scaler(Scaler::NoScaling);
148 }
149 else if(new_scaler == "MinimumMaximum")
150 {
151 set_scaler(Scaler::MinimumMaximum);
152 }
153 else if(new_scaler == "MeanStandardDeviation")
154 {
155 set_scaler(Scaler::MeanStandardDeviation);
156 }
157 else if(new_scaler == "StandardDeviation")
158 {
159 set_scaler(Scaler::StandardDeviation);
160 }
161 else if(new_scaler == "Logarithm")
162 {
163 set_scaler(Scaler::Logarithm);
164 }
165 else
166 {
167 ostringstream buffer;
168
169 buffer << "OpenNN Exception: DataSet class.\n"
170 << "void set_scaler(const string&) method.\n"
171 << "Unknown scaler: " << new_scaler << "\n";
172
173 throw logic_error(buffer.str());
174 }
175}
176
177
178
181
182void DataSet::Column::set_use(const VariableUse& new_column_use)
183{
184 column_use = new_column_use;
185
186 for(Index i = 0; i < categories_uses.size(); i++)
187 {
188 categories_uses(i) = new_column_use;
189 }
190}
191
192
195
196void DataSet::Column::set_use(const string& new_column_use)
197{
198 if(new_column_use == "Input")
199 {
200 set_use(VariableUse::Input);
201 }
202 else if(new_column_use == "Target")
203 {
204 set_use(VariableUse::Target);
205 }
206 else if(new_column_use == "Time")
207 {
208 set_use(VariableUse::Time);
209 }
210 else if(new_column_use == "Unused")
211 {
212 set_use(VariableUse::UnusedVariable);
213 }
214 else
215 {
216 ostringstream buffer;
217
218 buffer << "OpenNN Exception: DataSet class.\n"
219 << "void set_use(const string&) method.\n"
220 << "Unknown column use: " << new_column_use << "\n";
221
222 throw logic_error(buffer.str());
223 }
224}
225
226
229
230void DataSet::Column::set_type(const string& new_column_type)
231{
232 if(new_column_type == "Numeric")
233 {
234 type = ColumnType::Numeric;
235 }
236 else if(new_column_type == "Binary")
237 {
238 type = ColumnType::Binary;
239 }
240 else if(new_column_type == "Categorical")
241 {
242 type = ColumnType::Categorical;
243 }
244 else if(new_column_type == "DateTime")
245 {
246 type = ColumnType::DateTime;
247 }
248 else if(new_column_type == "Constant")
249 {
250 type = ColumnType::Constant;
251 }
252 else
253 {
254 ostringstream buffer;
255
256 buffer << "OpenNN Exception: DataSet class.\n"
257 << "void Column::set_type(const string&) method.\n"
258 << "Column type not valid (" << new_column_type << ").\n";
259
260 throw logic_error(buffer.str());
261
262 }
263}
264
265
269
270void DataSet::Column::add_category(const string & new_category)
271{
272 const Index old_categories_number = categories.size();
273
274 Tensor<string, 1> old_categories = categories;
275 Tensor<VariableUse, 1> old_categories_uses = categories_uses;
276
277 categories.resize(old_categories_number+1);
278 categories_uses.resize(old_categories_number+1);
279
280 for(Index category_index = 0; category_index < old_categories_number; category_index++)
281 {
282 categories(category_index) = old_categories(category_index);
283 categories_uses(category_index) = column_use;
284 }
285
286 categories(old_categories_number) = new_category;
287 categories_uses(old_categories_number) = column_use;
288}
289
290
293
294void DataSet::Column::set_categories_uses(const Tensor<string, 1>& new_categories_uses)
295{
296 const Index new_categories_uses_number = new_categories_uses.size();
297
298 categories_uses.resize(new_categories_uses_number);
299
300 for(Index i = 0; i < new_categories_uses.size(); i++)
301 {
302 if(new_categories_uses(i) == "Input")
303 {
304 categories_uses(i) = VariableUse::Input;
305 }
306 else if(new_categories_uses(i) == "Target")
307 {
308 categories_uses(i) = VariableUse::Target;
309 }
310 else if(new_categories_uses(i) == "Time")
311 {
312 categories_uses(i) = VariableUse::Time;
313 }
314 else if(new_categories_uses(i) == "Unused"
315 || new_categories_uses(i) == "UnusedVariable")
316 {
317 categories_uses(i) = VariableUse::UnusedVariable;
318 }
319 else
320 {
321 ostringstream buffer;
322
323 buffer << "OpenNN Exception: DataSet class.\n"
324 << "void Column::set_categories_uses(const Tensor<string, 1>&) method.\n"
325 << "Category use not valid (" << new_categories_uses(i) << ").\n";
326
327 throw logic_error(buffer.str());
328 }
329 }
330}
331
332
// Sets every category of the column to the same use.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    categories_uses.setConstant(new_categories_use);
}
340
341
/// Loads this column's members (name, scaler, use, type and — for categorical
/// columns — the categories and their uses) from an XML document.
/// The document is expected to contain Name, Scaler, ColumnUse and Type
/// elements, plus Categories/CategoriesUses (semicolon-separated) when the
/// type is Categorical.
/// @param column_document XML document holding the column definition.
/// @throws logic_error if any required element is missing.

void DataSet::Column::from_XML(const tinyxml2::XMLDocument& column_document)
{
    ostringstream buffer;

    // Name

    const tinyxml2::XMLElement* name_element = column_document.FirstChildElement("Name");

    if(!name_element)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
               << "Name element is nullptr.\n";

        throw logic_error(buffer.str());
    }

    // GetText() returns nullptr for empty elements, so each value is optional.
    if(name_element->GetText())
    {
        const string new_name = name_element->GetText();

        name = new_name;
    }

    // Scaler

    const tinyxml2::XMLElement* scaler_element = column_document.FirstChildElement("Scaler");

    if(!scaler_element)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
               << "Scaler element is nullptr.\n";

        throw logic_error(buffer.str());
    }

    if(scaler_element->GetText())
    {
        const string new_scaler = scaler_element->GetText();

        set_scaler(new_scaler);
    }

    // Column use

    const tinyxml2::XMLElement* column_use_element = column_document.FirstChildElement("ColumnUse");

    if(!column_use_element)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
               << "Column use element is nullptr.\n";

        throw logic_error(buffer.str());
    }

    if(column_use_element->GetText())
    {
        const string new_column_use = column_use_element->GetText();

        set_use(new_column_use);
    }

    // Type

    const tinyxml2::XMLElement* type_element = column_document.FirstChildElement("Type");

    if(!type_element)
    {
        buffer << "OpenNN Exception: DataSet class.\n"
               << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
               << "Type element is nullptr.\n";

        throw logic_error(buffer.str());
    }

    if(type_element->GetText())
    {
        const string new_type = type_element->GetText();
        set_type(new_type);
    }

    // Categories are only serialized for categorical columns.
    if(type == ColumnType::Categorical)
    {
        // Categories

        const tinyxml2::XMLElement* categories_element = column_document.FirstChildElement("Categories");

        if(!categories_element)
        {
            buffer << "OpenNN Exception: DataSet class.\n"
                   << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
                   << "Categories element is nullptr.\n";

            throw logic_error(buffer.str());
        }

        if(categories_element->GetText())
        {
            const string new_categories = categories_element->GetText();

            // Categories are stored as a single ';'-separated string.
            categories = get_tokens(new_categories, ';');
        }

        // Categories uses

        const tinyxml2::XMLElement* categories_uses_element = column_document.FirstChildElement("CategoriesUses");

        if(!categories_uses_element)
        {
            buffer << "OpenNN Exception: DataSet class.\n"
                   << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
                   << "Categories uses element is nullptr.\n";

            throw logic_error(buffer.str());
        }

        if(categories_uses_element->GetText())
        {
            const string new_categories_uses = categories_uses_element->GetText();

            set_categories_uses(get_tokens(new_categories_uses, ';'));
        }
    }
}
468
469
/// Serializes this column (name, scaler, use, type and — for categorical or
/// binary columns — the categories and their uses) into the given XML printer.
/// This is the counterpart of from_XML.
/// @param file_stream XML printer to write into.

void DataSet::Column::write_XML(tinyxml2::XMLPrinter& file_stream) const
{
    // Name

    file_stream.OpenElement("Name");

    file_stream.PushText(name.c_str());

    file_stream.CloseElement();

    // Scaler

    file_stream.OpenElement("Scaler");

    if(scaler == Scaler::NoScaling)
    {
        file_stream.PushText("NoScaling");
    }
    else if(scaler == Scaler::MinimumMaximum)
    {
        file_stream.PushText("MinimumMaximum");
    }
    else if(scaler == Scaler::MeanStandardDeviation)
    {
        file_stream.PushText("MeanStandardDeviation");
    }
    else if(scaler == Scaler::StandardDeviation)
    {
        file_stream.PushText("StandardDeviation");
    }
    else if(scaler == Scaler::Logarithm)
    {
        file_stream.PushText("Logarithm");
    }

    file_stream.CloseElement();

    // Column use

    file_stream.OpenElement("ColumnUse");

    if(column_use == VariableUse::Input)
    {
        file_stream.PushText("Input");
    }
    else if(column_use == VariableUse::Target)
    {
        file_stream.PushText("Target");
    }
    else if(column_use == VariableUse::UnusedVariable)
    {
        file_stream.PushText("Unused");
    }
    else
    {
        // Any remaining use is written as "Time".
        file_stream.PushText("Time");
    }

    file_stream.CloseElement();

    // Type

    file_stream.OpenElement("Type");

    if(type == ColumnType::Numeric)
    {
        file_stream.PushText("Numeric");
    }
    else if(type == ColumnType::Binary)
    {
        file_stream.PushText("Binary");
    }
    else if(type == ColumnType::Categorical)
    {
        file_stream.PushText("Categorical");
    }
    else if(type == ColumnType::Constant)
    {
        file_stream.PushText("Constant");
    }
    else
    {
        file_stream.PushText("DateTime");
    }

    file_stream.CloseElement();

    // Binary columns may also carry category labels, so both types are serialized.
    if(type == ColumnType::Categorical || type == ColumnType::Binary)
    {
        // Nothing to write when the column has no categories.
        if(categories.size() == 0) return;

        // Categories

        file_stream.OpenElement("Categories");

        for(Index i = 0; i < categories.size(); i++)
        {
            file_stream.PushText(categories(i).c_str());

            // ';'-separated list; no trailing separator.
            if(i != categories.size()-1)
            {
                file_stream.PushText(";");
            }
        }

        file_stream.CloseElement();

        // Categories uses

        file_stream.OpenElement("CategoriesUses");

        for(Index i = 0; i < categories_uses.size(); i++)
        {
            if(categories_uses(i) == VariableUse::Input)
            {
                file_stream.PushText("Input");
            }
            else if(categories_uses(i) == VariableUse::Target)
            {
                file_stream.PushText("Target");
            }
            else if(categories_uses(i) == VariableUse::Time)
            {
                file_stream.PushText("Time");
            }
            else
            {
                file_stream.PushText("Unused");
            }

            if(i != categories_uses.size()-1)
            {
                file_stream.PushText(";");
            }
        }

        file_stream.CloseElement();
    }
}
609
610
/// Prints the column's name, use, type (with categories when applicable)
/// and scaler to the standard output. Intended for debugging/inspection.

void DataSet::Column::print() const
{
    cout << "Name: " << name << endl;

    cout << "Column use: ";

    switch (column_use)
    {
    case VariableUse::Input:
    {
        cout << "Input" << endl;
    }
    break;

    case VariableUse::Target:
    {
        cout << "Target" << endl;

    }
    break;

    case VariableUse::UnusedVariable:
    {
        cout << "Unused" << endl;

    }
    break;

    case VariableUse::Time:
    {
        cout << "Time" << endl;

    }
    break;

    case VariableUse::Id:
    {
        cout << "Id" << endl;
    }
    break;
    }

    cout << "Column type: ";

    switch (type)
    {
    case ColumnType::Numeric:
    {
        cout << "Numeric" << endl;
    }
    break;

    case ColumnType::Binary:
    {
        cout << "Binary" << endl;

        // Binary and categorical columns also list their category labels.
        cout << "Categories: " << categories << endl;
    }
    break;

    case ColumnType::Categorical:
    {
        cout << "Categorical" << endl;

        cout << "Categories: " << categories << endl;
    }
    break;

    case ColumnType::DateTime:
    {
        cout << "DateTime" << endl;

    }
    break;

    case ColumnType::Constant:
    {
        cout << "Constant" << endl;
    }
    break;
    }

    cout << "Scaler: ";

    switch (scaler)
    {
    case Scaler::NoScaling:
        cout << "NoScaling" << endl;
        break;

    case Scaler::MinimumMaximum:
        cout << "MinimumMaximum" << endl;
        break;

    case Scaler::MeanStandardDeviation:
        cout << "MeanStandardDeviation" << endl;
        break;

    case Scaler::StandardDeviation:
        cout << "StandardDeviation" << endl;
        break;

    case Scaler::Logarithm:
        cout << "Logarithm" << endl;
        break;
    }
}
718
719
720Index DataSet::Column::get_variables_number() const
721{
722 if(type == ColumnType::Categorical)
723 {
724 return categories.size();
725 }
726 else
727 {
728 return 1;
729 }
730}
731
732
// Returns the size of the categories vector.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    return categories.size();
}
739
740
// Counts the categories whose use is not UnusedVariable.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    Index used_categories_number = 0;

    for(Index i = 0; i < categories.size(); i++)
    {
        if(categories_uses(i) != VariableUse::UnusedVariable) used_categories_number++;
    }

    return used_categories_number;
}
754
755
// Returns the names of the used variables of this column: the column name
// itself for a used non-categorical column, or the labels of the used
// categories for a categorical column. An unused non-categorical column
// yields an empty vector.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    Tensor<string, 1> used_variables_names;

    if(type != ColumnType::Categorical && column_use != VariableUse::UnusedVariable)
    {
        used_variables_names.resize(1);
        used_variables_names.setConstant(name);
    }
    else if(type == ColumnType::Categorical)
    {
        used_variables_names.resize(get_used_categories_number());

        // Pack the used category names contiguously.
        Index category_index = 0;

        for(Index i = 0; i < categories.size(); i++)
        {
            if(categories_uses(i) != VariableUse::UnusedVariable)
            {
                used_variables_names(category_index) = categories(i);

                category_index++;
            }
        }
    }

    return used_variables_names;
}
786
787
// Rebuilds the columns vector for time-series forecasting: saves the current
// columns into time_series_columns, then creates lags_number lagged input
// copies ("_lag_i") and steps_ahead target copies ("_ahead_i") of every
// non-DateTime column. Constant columns are copied but marked unused.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    // Categorical columns?

    time_series_columns = columns;

    const Index columns_number = get_columns_number();

    Tensor<Column, 1> new_columns;

    if(has_time_columns())
    {
        // @todo check if there are more than one time column

        // The (single) time column is skipped, hence columns_number-1.
        new_columns.resize((columns_number-1)*(lags_number+steps_ahead));
    }
    else
    {
        new_columns.resize(columns_number*(lags_number+steps_ahead));
    }

    Index lag_index = lags_number - 1;
    Index ahead_index = 0;
    Index column_index = 0;
    Index new_column_index = 0;

    for(Index i = 0; i < columns_number*(lags_number+steps_ahead); i++)
    {
        column_index = i%columns_number;

        // DateTime columns are not lagged; skip them entirely.
        if(time_series_columns(column_index).type == ColumnType::DateTime) continue;

        if(i < lags_number*columns_number)
        {
            // Lagged copy: becomes an input variable.
            new_columns(new_column_index).name = columns(column_index).name + "_lag_" + to_string(lag_index);
            new_columns(new_column_index).set_use(VariableUse::Input);

            new_columns(new_column_index).type = columns(column_index).type;
            new_columns(new_column_index).categories = columns(column_index).categories;
            new_columns(new_column_index).categories_uses = columns(column_index).categories_uses;

            new_column_index++;
        }
        else
        {
            // Ahead copy: becomes a target, unless the column is constant.
            new_columns(new_column_index).name = columns(column_index).name + "_ahead_" + to_string(ahead_index);

            new_columns(new_column_index).type = columns(column_index).type;
            new_columns(new_column_index).categories = columns(column_index).categories;

            if(new_columns(new_column_index).type == ColumnType::Constant)
            {
                new_columns(new_column_index).set_use(VariableUse::UnusedVariable);
                new_columns(new_column_index).categories_uses.resize(columns(column_index).get_categories_number());
                new_columns(new_column_index).categories_uses.setConstant(VariableUse::UnusedVariable);
            }
            else
            {
                new_columns(new_column_index).set_use(VariableUse::Target);
                new_columns(new_column_index).categories_uses.resize(columns(column_index).get_categories_number());
                new_columns(new_column_index).categories_uses.setConstant(VariableUse::Target);
            }

            new_column_index++;
        }

        // Advance lag/ahead counters once per full pass over the columns.
        if(lag_index > 0 && column_index == columns_number - 1)
        {
            lag_index--;
        }
        else if(column_index == columns_number - 1)
        {
            ahead_index++;
        }
    }

    columns = new_columns;
}
868
869
/// Rearranges the data matrix for time-series forecasting: each new row holds
/// lags_number past values plus steps_ahead future values of every
/// (non-DateTime) variable, so the sample count shrinks by
/// lags_number + steps_ahead - 1.
/// NOTE(review): assumes time_series_data holds the original column-major
/// data and that data has already been copied into it — confirm against
/// the caller.

void DataSet::transform_time_series_data()
{
    // Categorical / Time columns?

    const Index old_samples_number = data.dimension(0);
    const Index old_variables_number = data.dimension(1);

    const Index new_samples_number = old_samples_number - (lags_number + steps_ahead - 1);
    const Index new_variables_number = has_time_columns() ? (old_variables_number-1) * (lags_number + steps_ahead) : old_variables_number * (lags_number + steps_ahead);

    data.resize(new_samples_number, new_variables_number);

    // index counts the DateTime columns skipped so far, shifting the
    // destination column block left for every skipped column.
    Index index = 0;

    for(Index j = 0; j < old_variables_number; j++)
    {
        if(columns(get_column_index(j)).type == ColumnType::DateTime)
        {
            index++;
            continue;
        }

        for(Index i = 0; i < lags_number+steps_ahead; i++)
        {
            // Copy one shifted window of variable j into the destination
            // column (column-major layout: one column = new_samples_number values).
            memcpy(data.data() + i*(old_variables_number-index)*new_samples_number + (j-index)*new_samples_number,
                   time_series_data.data() + i + j*old_samples_number,
                   static_cast<size_t>(old_samples_number-lags_number-steps_ahead+1)*sizeof(type));
        }
    }

    samples_uses.resize(new_samples_number);
}
904
905
909
910bool DataSet::is_sample_used(const Index& index) const
911{
912 if(samples_uses(index) == SampleUse::UnusedSample)
913 {
914 return false;
915 }
916 else
917 {
918 return true;
919 }
920}
921
922
925
926bool DataSet::is_sample_unused(const Index& index) const
927{
928 if(samples_uses(index) == SampleUse::UnusedSample)
929 {
930 return true;
931 }
932 else
933 {
934 return false;
935 }
936}
937
938
942
943Tensor<Index, 1> DataSet::get_samples_uses_numbers() const
944{
945 Tensor<Index, 1> count(4);
946
947 const Index samples_number = get_samples_number();
948
949 for(Index i = 0; i < samples_number; i++)
950 {
951 if(samples_uses(i) == SampleUse::Training)
952 {
953 count(0)++;
954 }
955 else if(samples_uses(i) == SampleUse::Selection)
956 {
957 count(1)++;
958 }
959 else if(samples_uses(i) == SampleUse::Testing)
960 {
961 count(2)++;
962 }
963 else
964 {
965 count(3)++;
966 }
967 }
968
969 return count;
970}
971
972
// Returns a 4-element vector with the percentage of training, selection,
// testing and unused samples over the total number of samples.
// NOTE(review): divides by samples_number — presumably never called on an
// empty data set; confirm against callers.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();
    const Index training_samples_number = get_training_samples_number();
    const Index selection_samples_number = get_selection_samples_number();
    const Index testing_samples_number = get_testing_samples_number();
    const Index unused_samples_number = get_unused_samples_number();

    const type training_samples_percentage = type(training_samples_number*100)/static_cast<type>(samples_number);
    const type selection_samples_percentage = type(selection_samples_number*100)/static_cast<type>(samples_number);
    const type testing_samples_percentage = type(testing_samples_number*100)/static_cast<type>(samples_number);
    const type unused_samples_percentage = type(unused_samples_number*100)/static_cast<type>(samples_number);

    Tensor<type, 1> samples_uses_percentage(4);

    samples_uses_percentage.setValues({training_samples_percentage,
                                       selection_samples_percentage,
                                       testing_samples_percentage,
                                       unused_samples_percentage});

    return samples_uses_percentage;
}
999
1000
1005
/// Returns one sample formatted as a separated string: numeric and date-time
/// values as decimal text, binary and categorical values as their category
/// labels, and missing values as missing_values_label.
/// @param sample_index Row of the sample in the data matrix.
/// @param separator String placed between column values.

string DataSet::get_sample_string(const Index& sample_index, const string& separator) const
{
    const Tensor<type, 1> sample = data.chip(sample_index, 0);

    string sample_string = "";

    const Index columns_number = get_columns_number();

    // variable_index walks the expanded variables; categorical columns
    // advance it by their number of categories (one-hot layout).
    Index variable_index = 0;

    for(Index i = 0; i < columns_number; i++)
    {
        if(columns(i).type == ColumnType::Numeric)
        {
            if(isnan(data(sample_index, variable_index))) sample_string += missing_values_label;
            else sample_string += to_string(double(data(sample_index, variable_index)));

            variable_index++;
        }
        else if(columns(i).type == ColumnType::Binary)
        {
            // The stored 0/1 value indexes the column's category labels.
            if(isnan(data(sample_index, variable_index))) sample_string += missing_values_label;
            else sample_string += columns(i).categories(static_cast<Index>(data(sample_index, variable_index)));

            variable_index++;
        }
        else if(columns(i).type == ColumnType::DateTime)
        {
            // @todo do something

            // For now date-times are printed as their raw numeric value.
            if(isnan(data(sample_index, variable_index))) sample_string += missing_values_label;
            else sample_string += to_string(double(data(sample_index, variable_index)));

            variable_index++;
        }
        else if(columns(i).type == ColumnType::Categorical)
        {
            if(isnan(data(sample_index, variable_index)))
            {
                sample_string += missing_values_label;
            }
            else
            {
                const Index categories_number = columns(i).get_categories_number();

                // Find the one-hot position whose value is (approximately) 1.
                for(Index j = 0; j < categories_number; j++)
                {
                    if(abs(data(sample_index, variable_index+j) - static_cast<type>(1)) < type(NUMERIC_LIMITS_MIN))
                    {
                        sample_string += columns(i).categories(j);
                        break;
                    }
                }

                variable_index += categories_number;
            }
        }

        if(i != columns_number-1) sample_string += separator + " ";
    }

    return sample_string;
}
1069
1070
// Returns the indices of the samples marked for training.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    const Index training_samples_number = get_training_samples_number();

    Tensor<Index, 1> training_indices(training_samples_number);

    Index count = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(samples_uses(i) == SampleUse::Training)
        {
            training_indices(count) = i;
            count++;
        }
    }

    return training_indices;
}
1094
1095
// Returns the indices of the samples marked for selection.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    const Index selection_samples_number = get_selection_samples_number();

    Tensor<Index, 1> selection_indices(selection_samples_number);

    Index count = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(samples_uses(i) == SampleUse::Selection)
        {
            selection_indices(count) = i;
            count++;
        }
    }

    return selection_indices;
}
1119
1120
// Returns the indices of the samples marked for testing.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    const Index testing_samples_number = get_testing_samples_number();

    Tensor<Index, 1> testing_indices(testing_samples_number);

    Index count = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(samples_uses(i) == SampleUse::Testing)
        {
            testing_indices(count) = i;
            count++;
        }
    }

    return testing_indices;
}
1144
1145
// Returns the indices of all samples that are not marked unused.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    const Index used_samples_number = samples_number - get_unused_samples_number();

    Tensor<Index, 1> used_indices(used_samples_number);

    Index index = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(samples_uses(i) != SampleUse::UnusedSample)
        {
            used_indices(index) = i;
            index++;
        }
    }

    return used_indices;
}
1169
1170
// Returns the indices of the samples marked unused.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    const Index unused_samples_number = get_unused_samples_number();

    Tensor<Index, 1> unused_indices(unused_samples_number);

    Index count = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(samples_uses(i) == SampleUse::UnusedSample)
        {
            unused_indices(count) = i;
            count++;
        }
    }

    return unused_indices;
}
1194
1195
// Returns the use (training/selection/testing/unused) of a single sample.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    return samples_uses(index);
}
1203
1204
1206
/// Returns the vector holding the use (training, selection, testing or
/// unused) of every sample in the data set.

const Tensor<DataSet::SampleUse,1 >& DataSet::get_samples_uses() const
{
    return samples_uses;
}
1211
1212
1216
1217Tensor<Index, 2> DataSet::get_batches(const Tensor<Index,1>& samples_indices,
1218 const Index& batch_samples_number,
1219 const bool& shuffle,
1220 const Index& new_buffer_size) const
1221{
1222 if(!shuffle) return split_samples(samples_indices, batch_samples_number);
1223
1224 std::random_device rng;
1225 std::mt19937 urng(rng());
1226
1227 const Index samples_number = samples_indices.size();
1228
1229 Index buffer_size = new_buffer_size;
1230 Index batches_number;
1231 Index batch_size = batch_samples_number;
1232
1233 // When samples_number is less than 100 (small sample)
1234
1235 if(buffer_size > samples_number)
1236 {
1237 buffer_size = samples_number;
1238 }
1239
1240 // Check batch size and samples number
1241
1242 if(samples_number < batch_size)
1243 {
1244 batches_number = 1;
1245 batch_size = samples_number;
1246 buffer_size = batch_size;
1247
1248 Tensor<Index,1> samples_copy(samples_indices);
1249
1250 Tensor<Index, 2> batches(batches_number, batch_size);
1251
1252 // Shuffle
1253
1254 std::shuffle(samples_copy.data(), samples_copy.data() + samples_copy.size(), urng);
1255
1256 for(Index i = 0; i > batch_size; i++)
1257 {
1258 batches(0,i) = samples_copy(i);
1259
1260 }
1261 return batches;
1262
1263 }
1264 else
1265 {
1266 batches_number = samples_number / batch_size;
1267 }
1268
1269 Tensor<Index, 2> batches(batches_number, batch_size);
1270
1271 Tensor<Index, 1> buffer(buffer_size);
1272
1273 for(Index i = 0; i < buffer_size; i++) buffer(i) = i;
1274
1275 Index next_index = buffer_size;
1276 Index random_index = 0;
1277
1278 // Heuristic cases for batch shuffling
1279
1280 if(batch_size < buffer_size)
1281 {
1282 Index diff = buffer_size/ batch_size;
1283
1284 // Main Loop
1285
1286 for(Index i = 0; i < batches_number; i++)
1287 {
1288 // Last batch
1289
1290 if(i == batches_number-diff)
1291 {
1292 Index buffer_index = 0;
1293
1294 for(Index k = batches_number-diff; k < batches_number; k++)
1295 {
1296 for(Index j = 0; j < batch_size; j++)
1297 {
1298 batches(k,j) = buffer(buffer_index);
1299
1300 buffer_index++;
1301 }
1302 }
1303
1304 break;
1305 }
1306
1307 // Shuffle batches
1308
1309 for(Index j = 0; j < batch_size; j++)
1310 {
1311 random_index = static_cast<Index>(rand()%buffer_size);
1312
1313 batches(i, j) = buffer(random_index);
1314
1315 buffer(random_index) = samples_indices(next_index);
1316
1317 next_index++;
1318 }
1319 }
1320
1321 return batches;
1322 }
1323 else // buffer_size <= batch_size
1324 {
1325 // Main Loop
1326
1327 for(Index i = 0; i < batches_number; i++)
1328 {
1329 // Last batch
1330
1331 if(i == batches_number-1)
1332 {
1333 std::shuffle(buffer.data(), buffer.data() + buffer.size(), urng);
1334
1335 if(batch_size <= buffer_size)
1336 {
1337 for(Index j = 0; j < batch_size;j++)
1338 {
1339 batches(i,j) = buffer(j);
1340 }
1341 }
1342 else //buffer_size < batch_size
1343 {
1344 for(Index j = 0; j < buffer_size; j++)
1345 {
1346 batches(i,j) = buffer(j);
1347 }
1348
1349 for(Index j = buffer_size; j < batch_size; j++)
1350 {
1351 batches(i,j) = samples_indices(next_index);
1352
1353 next_index++;
1354 }
1355 }
1356
1357 break;
1358 }
1359
1360 // Shuffle batches
1361
1362 for(Index j = 0; j < batch_size; j++)
1363 {
1364 random_index = static_cast<Index>(rand()%buffer_size);
1365
1366 batches(i, j) = buffer(random_index);
1367
1368 buffer(random_index) = samples_indices(next_index);
1369
1370 next_index++;
1371
1372 }
1373 }
1374
1375 return batches;
1376 }
1377}
1378
1379
// Returns the number of samples marked for training.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    Index training_samples_number = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(samples_uses(i) == SampleUse::Training)
        {
            training_samples_number++;
        }
    }

    return training_samples_number;
}
1398
1399
// Returns the number of samples marked for selection.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    Index selection_samples_number = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(samples_uses(i) == SampleUse::Selection)
        {
            selection_samples_number++;
        }
    }

    return selection_samples_number;
}
1418
1419
// Returns the number of samples marked for testing.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    Index testing_samples_number = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(samples_uses(i) == SampleUse::Testing)
        {
            testing_samples_number++;
        }
    }

    return testing_samples_number;
}
1438
1439
// Returns the number of samples that are not marked unused.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();
    const Index unused_samples_number = get_unused_samples_number();

    return (samples_number - unused_samples_number);
}
1450
1451
// Returns the number of samples marked unused.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    Index unused_samples_number = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(samples_uses(i) == SampleUse::UnusedSample)
        {
            unused_samples_number++;
        }
    }

    return unused_samples_number;
}
1471
1472
// Marks every sample in the data set for training.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    for(Index i = 0; i < samples_number; i++)
    {
        samples_uses(i) = SampleUse::Training;
    }
}
1484
1485
// Marks every sample in the data set for selection.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    for(Index i = 0; i < samples_number; i++)
    {
        samples_uses(i) = SampleUse::Selection;
    }
}
1497
1498
// Marks every sample in the data set for testing.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    for(Index i = 0; i < samples_number; i++)
    {
        samples_uses(i) = SampleUse::Testing;
    }
}
1510
1511
1514
1515void DataSet::set_training(const Tensor<Index, 1>& indices)
1516{
1517 Index index = 0;
1518
1519 for(Index i = 0; i < indices.size(); i++)
1520 {
1521 index = indices(i);
1522
1523 samples_uses(index) = SampleUse::Training;
1524 }
1525}
1526
1527
1530
1531void DataSet::set_selection(const Tensor<Index, 1>& indices)
1532{
1533 Index index = 0;
1534
1535 for(Index i = 0; i < indices.size(); i++)
1536 {
1537 index = indices(i);
1538
1539 samples_uses(index) = SampleUse::Selection;
1540 }
1541}
1542
1543
1546
1547void DataSet::set_testing(const Tensor<Index, 1>& indices)
1548{
1549 Index index = 0;
1550
1551 for(Index i = 0; i < indices.size(); i++)
1552 {
1553 index = indices(i);
1554
1555 samples_uses(index) = SampleUse::Testing;
1556 }
1557}
1558
1559
// Marks every sample in the data set as unused.
// NOTE(review): the signature line is not visible in this chunk; body only.
{
    const Index samples_number = get_samples_number();

    for(Index i = 0; i < samples_number; i++)
    {
        samples_uses(i) = SampleUse::UnusedSample;
    }
}
1571
1572
1575
1576void DataSet::set_samples_unused(const Tensor<Index, 1>& indices)
1577{
1578 for(Index i = 0; i < static_cast<Index>(indices.size()); i++)
1579 {
1580 const Index index = indices(i);
1581
1582 samples_uses(index) = SampleUse::UnusedSample;
1583 }
1584}
1585
1586
1590
/// Sets the use of a single sample.
/// @param index Index of the sample.
/// @param new_use New use for that sample (training, selection, testing or unused).

void DataSet::set_sample_use(const Index& index, const SampleUse& new_use)
{
    samples_uses(index) = new_use;

}
1596
1597
1601
1602void DataSet::set_sample_use(const Index& index, const string& new_use)
1603{
1604 if(new_use == "Training")
1605 {
1606 samples_uses(index) = SampleUse::Training;
1607 }
1608 else if(new_use == "Selection")
1609 {
1610 samples_uses(index) = SampleUse::Selection;
1611 }
1612 else if(new_use == "Testing")
1613 {
1614 samples_uses(index) = SampleUse::Testing;
1615 }
1616 else if(new_use == "Unused")
1617 {
1618 samples_uses(index) = SampleUse::UnusedSample;
1619 }
1620 else
1621 {
1622 ostringstream buffer;
1623
1624 buffer << "OpenNN Exception: DataSet class.\n"
1625 << "void set_sample_use(const string&) method.\n"
1626 << "Unknown sample use: " << new_use << "\n";
1627
1628 throw logic_error(buffer.str());
1629 }
1630}
1631
1632
1636
1637void DataSet::set_samples_uses(const Tensor<SampleUse, 1>& new_uses)
1638{
1639 const Index samples_number = get_samples_number();
1640
1641#ifdef OPENNN_DEBUG
1642
1643 const Index new_uses_size = new_uses.size();
1644
1645 if(new_uses_size != samples_number)
1646 {
1647 ostringstream buffer;
1648
1649 buffer << "OpenNN Exception: DataSet class.\n"
1650 << "void set_samples_uses(const Tensor<SampleUse, 1>&) method.\n"
1651 << "Size of uses(" << new_uses_size << ") must be equal to number of samples(" << samples_number << ").\n";
1652
1653 throw logic_error(buffer.str());
1654 }
1655
1656#endif
1657
1658 for(Index i = 0; i < samples_number; i++)
1659 {
1660 samples_uses(i) = new_uses(i);
1661 }
1662}
1663
1664
1669
1670void DataSet::set_samples_uses(const Tensor<string, 1>& new_uses)
1671{
1672 const Index samples_number = get_samples_number();
1673
1674 ostringstream buffer;
1675
1676#ifdef OPENNN_DEBUG
1677
1678 const Index new_uses_size = new_uses.size();
1679
1680 if(new_uses_size != samples_number)
1681 {
1682 buffer << "OpenNN Exception: DataSet class.\n"
1683 << "void set_samples_uses(const Tensor<string, 1>&) method.\n"
1684 << "Size of uses(" << new_uses_size << ") must be equal to number of samples(" << samples_number << ").\n";
1685
1686 throw logic_error(buffer.str());
1687 }
1688
1689#endif
1690
1691 for(Index i = 0; i < samples_number; i++)
1692 {
1693 if(new_uses(i).compare("Training") == 0 || new_uses(i).compare("0") == 0)
1694 {
1695 samples_uses(i) = SampleUse::Training;
1696 }
1697 else if(new_uses(i).compare("Selection") == 0 || new_uses(i).compare("1") == 0)
1698 {
1699 samples_uses(i) = SampleUse::Selection;
1700 }
1701 else if(new_uses(i).compare("Testing") == 0 || new_uses(i).compare("2") == 0)
1702 {
1703 samples_uses(i) = SampleUse::Testing;
1704 }
1705 else if(new_uses(i).compare("Unused") == 0 || new_uses(i).compare("3") == 0)
1706 {
1707 samples_uses(i) = SampleUse::UnusedSample;
1708 }
1709 else
1710 {
1711 buffer << "OpenNN Exception: DataSet class.\n"
1712 << "void set_samples_uses(const Tensor<string, 1>&) method.\n"
1713 << "Unknown sample use: " << new_uses(i) << ".\n";
1714
1715 throw logic_error(buffer.str());
1716 }
1717 }
1718}
1719
1720
1725
1726void DataSet::split_samples_random(const type& training_samples_ratio,
1727 const type& selection_samples_ratio,
1728 const type& testing_samples_ratio)
1729{
1730 std::random_device rng;
1731 std::mt19937 urng(rng());
1732
1733 const Index used_samples_number = get_used_samples_number();
1734
1735 if(used_samples_number == 0) return;
1736
1737 const type total_ratio = training_samples_ratio + selection_samples_ratio + testing_samples_ratio;
1738
1739 // Get number of samples for training, selection and testing
1740
1741 const Index selection_samples_number = static_cast<Index>(selection_samples_ratio*type(used_samples_number)/type(total_ratio));
1742 const Index testing_samples_number = static_cast<Index>(testing_samples_ratio* type(used_samples_number)/ type(total_ratio));
1743 const Index training_samples_number = used_samples_number - selection_samples_number - testing_samples_number;
1744
1745 const Index sum_samples_number = training_samples_number + selection_samples_number + testing_samples_number;
1746
1747 if(sum_samples_number != used_samples_number)
1748 {
1749 ostringstream buffer;
1750
1751 buffer << "OpenNN Warning: DataSet class.\n"
1752 << "void split_samples_random(const type&, const type&, const type&) method.\n"
1753 << "Sum of numbers of training, selection and testing samples is not equal to number of used samples.\n";
1754
1755 throw logic_error(buffer.str());
1756 }
1757
1758 const Index samples_number = get_samples_number();
1759
1760 Tensor<Index, 1> indices;
1761
1762 initialize_sequential(indices, 0, 1, samples_number-1);
1763
1764 std::shuffle(indices.data(), indices.data() + indices.size(), urng);
1765
1766 Index count = 0;
1767
1768 for(Index i = 0; i < samples_uses.size(); i++)
1769 {
1770 if(samples_uses(i) == SampleUse::UnusedSample) count ++;
1771 }
1772
1773 Index i = 0;
1774 Index index;
1775
1776 // Training
1777
1778 Index count_training = 0;
1779
1780 while(count_training != training_samples_number)
1781 {
1782 index = indices(i);
1783
1784 if(samples_uses(index) != SampleUse::UnusedSample)
1785 {
1786 samples_uses(index)= SampleUse::Training;
1787 count_training++;
1788 }
1789
1790 i++;
1791 }
1792
1793 // Selection
1794
1795 Index count_selection = 0;
1796
1797 while(count_selection != selection_samples_number)
1798 {
1799 index = indices(i);
1800
1801 if(samples_uses(index) != SampleUse::UnusedSample)
1802 {
1803 samples_uses(index) = SampleUse::Selection;
1804 count_selection++;
1805 }
1806
1807 i++;
1808 }
1809
1810 // Testing
1811
1812 Index count_testing = 0;
1813
1814 while(count_testing != testing_samples_number)
1815 {
1816 index = indices(i);
1817
1818 if(samples_uses(index) != SampleUse::UnusedSample)
1819 {
1820 samples_uses(index) = SampleUse::Testing;
1821 count_testing++;
1822 }
1823
1824 i++;
1825 }
1826}
1827
1828
1833
1834void DataSet::split_samples_sequential(const type& training_samples_ratio,
1835 const type& selection_samples_ratio,
1836 const type& testing_samples_ratio)
1837{
1838 const Index used_samples_number = get_used_samples_number();
1839
1840 if(used_samples_number == 0) return;
1841
1842 const type total_ratio = training_samples_ratio + selection_samples_ratio + testing_samples_ratio;
1843
1844 // Get number of samples for training, selection and testing
1845
1846 const Index selection_samples_number = static_cast<Index>(selection_samples_ratio* type(used_samples_number)/ type(total_ratio));
1847 const Index testing_samples_number = static_cast<Index>(testing_samples_ratio* type(used_samples_number)/ type(total_ratio));
1848 const Index training_samples_number = used_samples_number - selection_samples_number - testing_samples_number;
1849
1850 const Index sum_samples_number = training_samples_number + selection_samples_number + testing_samples_number;
1851
1852 if(sum_samples_number != used_samples_number)
1853 {
1854 ostringstream buffer;
1855
1856 buffer << "OpenNN Warning: Samples class.\n"
1857 << "void split_samples_sequential(const type&, const type&, const type&) method.\n"
1858 << "Sum of numbers of training, selection and testing samples is not equal to number of used samples.\n";
1859
1860 throw logic_error(buffer.str());
1861 }
1862
1863 Index i = 0;
1864
1865 // Training
1866
1867 Index count_training = 0;
1868
1869 while(count_training != training_samples_number)
1870 {
1871 if(samples_uses(i) != SampleUse::UnusedSample)
1872 {
1873 samples_uses(i) = SampleUse::Training;
1874 count_training++;
1875 }
1876
1877 i++;
1878 }
1879
1880 // Selection
1881
1882 Index count_selection = 0;
1883
1884 while(count_selection != selection_samples_number)
1885 {
1886 if(samples_uses(i) != SampleUse::UnusedSample)
1887 {
1888 samples_uses(i) = SampleUse::Selection;
1889 count_selection++;
1890 }
1891
1892 i++;
1893 }
1894
1895 // Testing
1896
1897 Index count_testing = 0;
1898
1899 while(count_testing != testing_samples_number)
1900 {
1901 if(samples_uses(i) != SampleUse::UnusedSample)
1902 {
1903 samples_uses(i) = SampleUse::Testing;
1904 count_testing++;
1905 }
1906
1907 i++;
1908 }
1909}
1910
1911
/// Replaces the whole columns description vector of the data set.
/// @param new_columns New columns (name, use, type, scaler and categories per column).

void DataSet::set_columns(const Tensor<Column, 1>& new_columns)
{
    columns = new_columns;
}
1916
1917
1920
1922{
1923 const Index columns_number = columns.size();
1924
1925 bool target = false;
1926
1927 if(columns_number == 0)
1928 {
1929 return;
1930 }
1931
1932 else if(columns_number == 1)
1933 {
1934 columns(0).set_use(VariableUse::UnusedVariable);
1935 }
1936
1937 else
1938 {
1939 set_input();
1940
1941 for(Index i = columns.size()-1; i >= 0; i--)
1942 {
1943 if(columns(i).type == ColumnType::Constant || columns(i).type == ColumnType::DateTime)
1944 {
1945 columns(i).set_use(VariableUse::UnusedVariable);
1946 continue;
1947 }
1948
1949 if(!target)
1950 {
1951 columns(i).set_use(VariableUse::Target);
1952
1953 target = true;
1954
1955 continue;
1956 }
1957 }
1958
1959 input_variables_dimensions.resize(1);
1960 }
1961}
1962
1963
1967
1969{
1970 const Index columns_number = columns.size();
1971
1972 for(Index i = 0; i < columns_number; i++)
1973 {
1974 columns(i).name = "column_" + to_string(1+i);
1975 }
1976}
1977
1978
1982
/// Sets the name of a single column.
/// @param column_index Index of the column.
/// @param new_name New name for that column.

void DataSet::set_column_name(const Index& column_index, const string& new_name)
{
    columns(column_index).name = new_name;
}
1987
1988
1991
1993{
1994 return get_variables_uses()(index);
1995}
1996
1997
1999
2001{
2002 return columns(index).column_use;
2003}
2004
2005
2007
2008Tensor<DataSet::VariableUse, 1> DataSet::get_columns_uses() const
2009{
2010 const Index columns_number = get_columns_number();
2011
2012 Tensor<DataSet::VariableUse, 1> columns_uses(columns_number);
2013
2014 for(Index i = 0; i < columns_number; i++)
2015 {
2016 columns_uses(i) = columns(i).column_use;
2017 }
2018
2019 return columns_uses;
2020}
2021
2022
2025
2026Tensor<DataSet::VariableUse, 1> DataSet::get_variables_uses() const
2027{
2028 const Index columns_number = get_columns_number();
2029 const Index variables_number = get_variables_number();
2030
2031 Tensor<VariableUse, 1> variables_uses(variables_number);
2032
2033 Index index = 0;
2034
2035 for(Index i = 0; i < columns_number; i++)
2036 {
2037 if(columns(i).type == ColumnType::Categorical)
2038 {
2039 for(Index i = 0; i < (columns(i).categories_uses).size(); i++)
2040 {
2041 variables_uses(i + index) = (columns(i).categories_uses)(i);
2042 }
2043 index += columns(i).categories.size();
2044 }
2045 else
2046 {
2047 variables_uses(index) = columns(i).column_use;
2048 index++;
2049 }
2050 }
2051
2052 return variables_uses;
2053}
2054
2055
2058
2059string DataSet::get_variable_name(const Index& variable_index) const
2060{
2061#ifdef OPENNN_DEBUG
2062
2063 const Index variables_number = get_variables_number();
2064
2065 if(variable_index >= variables_number)
2066 {
2067 ostringstream buffer;
2068
2069 buffer << "OpenNN Exception: DataSet class.\n"
2070 << "string& get_variable_name(const Index) method.\n"
2071 << "Index of variable("<<variable_index<<") must be less than number of variables("<<variables_number<<").\n";
2072
2073 throw logic_error(buffer.str());
2074 }
2075
2076#endif
2077
2078 const Index columns_number = get_columns_number();
2079
2080 Index index = 0;
2081
2082 for(Index i = 0; i < columns_number; i++)
2083 {
2084 if(columns(i).type == ColumnType::Categorical)
2085 {
2086 for(Index j = 0; j < columns(i).get_categories_number(); j++)
2087 {
2088 if(index == variable_index)
2089 {
2090 return columns(i).categories(j);
2091 }
2092 else
2093 {
2094 index++;
2095 }
2096 }
2097 }
2098 else
2099 {
2100 if(index == variable_index)
2101 {
2102 return columns(i).name;
2103 }
2104 else
2105 {
2106 index++;
2107 }
2108 }
2109 }
2110
2111 return string();
2112}
2113
2114
2117
2118Tensor<string, 1> DataSet::get_variables_names() const
2119{
2120 const Index variables_number = get_variables_number();
2121
2122 Tensor<string, 1> variables_names(variables_number);
2123
2124 Index index = 0;
2125
2126 for(Index i = 0; i < columns.size(); i++)
2127 {
2128 if(columns(i).type == ColumnType::Categorical)
2129 {
2130 for(Index j = 0; j < columns(i).categories.size(); j++)
2131 {
2132 variables_names(index) = columns(i).categories(j);
2133
2134 index++;
2135 }
2136 }
2137 else
2138 {
2139 variables_names(index) = columns(i).name;
2140
2141 index++;
2142 }
2143 }
2144
2145 return variables_names;
2146}
2147
2150
2152{
2153 const Index variables_number = get_time_series_variables_number();
2154
2155 Tensor<string, 1> variables_names(variables_number);
2156
2157 Index index = 0;
2158
2159 for(Index i = 0; i < time_series_columns.size(); i++)
2160 {
2161 if(time_series_columns(i).type == ColumnType::Categorical)
2162 {
2163 for(Index j = 0; j < time_series_columns(i).categories.size(); j++)
2164 {
2165 variables_names(index) = time_series_columns(i).categories(j);
2166
2167 index++;
2168 }
2169 }
2170 else
2171 {
2172 variables_names(index) = time_series_columns(i).name;
2173 index++;
2174 }
2175 }
2176
2177 return variables_names;
2178}
2179
2180
2183
2184Tensor<string, 1> DataSet::get_input_variables_names() const
2185{
2186 const Index input_variables_number = get_input_variables_number();
2187
2188 const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
2189
2190 Tensor<string, 1> input_variables_names(input_variables_number);
2191
2192 Index index = 0;
2193
2194 for(Index i = 0; i < input_columns_indices.size(); i++)
2195 {
2196 Index input_index = input_columns_indices(i);
2197
2198 const Tensor<string, 1> current_used_variables_names = columns(input_index).get_used_variables_names();
2199
2200 for(Index j = 0; j < current_used_variables_names.size(); j++)
2201 {
2202 input_variables_names(index + j) = current_used_variables_names(j);
2203 }
2204
2205 index += current_used_variables_names.size();
2206 }
2207
2208 return input_variables_names;
2209}
2210
2211
2214
2216{
2217 const Index target_variables_number = get_target_variables_number();
2218
2219 const Tensor<Index, 1> target_columns_indices = get_target_columns_indices();
2220
2221 Tensor<string, 1> target_variables_names(target_variables_number);
2222
2223 Index index = 0;
2224
2225 for(Index i = 0; i < target_columns_indices.size(); i++)
2226 {
2227 const Index target_index = target_columns_indices(i);
2228
2229 const Tensor<string, 1> current_used_variables_names = columns(target_index).get_used_variables_names();
2230
2231 for(Index j = 0; j < current_used_variables_names.size(); j++)
2232 {
2233 target_variables_names(index + j) = current_used_variables_names(j);
2234 }
2235
2236 index += current_used_variables_names.size();
2237 }
2238
2239 return target_variables_names;
2240}
2241
2242
2244
/// Returns the dimensions vector of the input variables.

const Tensor<Index, 1>& DataSet::get_input_variables_dimensions() const
{
    return input_variables_dimensions;
}
2249
2250
/// Returns the rank (number of dimensions) of the input variables tensor.

Index DataSet::get_input_variables_rank() const
{
    return input_variables_dimensions.rank();
}
2255
2256
2258
2260{
2261 const Index variables_number = get_variables_number();
2262
2263 const Index unused_variables_number = get_unused_variables_number();
2264
2265 return (variables_number - unused_variables_number);
2266}
2267
2268
2270
2272{
2273 const Index input_columns_number = get_input_columns_number();
2274
2275 Tensor<Index, 1> input_columns_indices(input_columns_number);
2276
2277 Index index = 0;
2278
2279 for(Index i = 0; i < columns.size(); i++)
2280 {
2281 if(columns(i).column_use == VariableUse::Input)
2282 {
2283 input_columns_indices(index) = i;
2284 index++;
2285 }
2286 }
2287
2288 return input_columns_indices;
2289}
2290
2291
2292Tensor<Index, 1> DataSet::get_input_time_series_columns_indices() const
2293{
2294 const Index input_columns_number = get_input_time_series_columns_number();
2295
2296 Tensor<Index, 1> input_columns_indices(input_columns_number);
2297
2298 Index index = 0;
2299
2300 for(Index i = 0; i < time_series_columns.size(); i++)
2301 {
2302 if(time_series_columns(i).column_use == VariableUse::Input)
2303 {
2304 input_columns_indices(index) = i;
2305 index++;
2306 }
2307 }
2308
2309 return input_columns_indices;
2310}
2311
2312
2314
2316{
2317 const Index target_columns_number = get_target_columns_number();
2318
2319 Tensor<Index, 1> target_columns_indices(target_columns_number);
2320
2321 Index index = 0;
2322
2323 for(Index i = 0; i < columns.size(); i++)
2324 {
2325 if(columns(i).column_use == VariableUse::Target)
2326 {
2327 target_columns_indices(index) = i;
2328 index++;
2329 }
2330 }
2331
2332 return target_columns_indices;
2333}
2334
2335
2336Tensor<Index, 1> DataSet::get_target_time_series_columns_indices() const
2337{
2338 const Index target_columns_number = get_target_time_series_columns_number();
2339
2340 Tensor<Index, 1> target_columns_indices(target_columns_number);
2341
2342 Index index = 0;
2343
2344 for(Index i = 0; i < time_series_columns.size(); i++)
2345 {
2346 if(time_series_columns(i).column_use == VariableUse::Target)
2347 {
2348 target_columns_indices(index) = i;
2349 index++;
2350 }
2351 }
2352
2353 return target_columns_indices;
2354}
2355
2356
2358
2360{
2361 const Index unused_columns_number = get_unused_columns_number();
2362
2363 Tensor<Index, 1> unused_columns_indices(unused_columns_number);
2364
2365 Index index = 0;
2366
2367 for(Index i = 0; i < unused_columns_number; i++)
2368 {
2369
2370 if(columns(i).column_use == VariableUse::UnusedVariable)
2371 {
2372 unused_columns_indices(index) = i;
2373 index++;
2374 }
2375 }
2376
2377 return unused_columns_indices;
2378}
2379
2380
2382
2384{
2385 const Index variables_number = get_variables_number();
2386
2387 const Index used_variables_number = get_used_variables_number();
2388
2389 Tensor<Index, 1> used_indices(used_variables_number);
2390
2391 Index index = 0;
2392
2393 for(Index i = 0; i < variables_number; i++)
2394 {
2395 if(columns(i).column_use == VariableUse::Input
2396 || columns(i).column_use == VariableUse::Target
2397 || columns(i).column_use == VariableUse::Time)
2398 {
2399 used_indices(index) = i;
2400 index++;
2401 }
2402 }
2403
2404 return used_indices;
2405}
2406
2407
2408Tensor<Scaler, 1> DataSet::get_columns_scalers() const
2409{
2410 const Index columns_number = get_columns_number();
2411
2412 Tensor<Scaler, 1> columns_scalers(columns_number);
2413
2414 for(Index i = 0; i < columns_number; i++)
2415 {
2416 columns_scalers(i) = columns(i).scaler;
2417 }
2418
2419 return columns_scalers;
2420}
2421
2422
2423Tensor<Scaler, 1> DataSet::get_input_variables_scalers() const
2424{
2425 const Index input_columns_number = get_input_columns_number();
2426 const Index input_variables_number = get_input_variables_number();
2427
2428 const Tensor<Column, 1> input_columns = get_input_columns();
2429
2430 Tensor<Scaler, 1> input_variables_scalers(input_variables_number);
2431
2432 Index index = 0;
2433
2434 for(Index i = 0; i < input_columns_number; i++)
2435 {
2436 for(Index j = 0; j < input_columns(i).get_variables_number(); j++)
2437 {
2438 input_variables_scalers(index) = input_columns(i).scaler;
2439 index++;
2440 }
2441 }
2442
2443 return input_variables_scalers;
2444}
2445
2446
2447Tensor<Scaler, 1> DataSet::get_target_variables_scalers() const
2448{
2449 const Index target_columns_number = get_target_columns_number();
2450 const Index target_variables_number = get_target_variables_number();
2451
2452 const Tensor<Column, 1> target_columns = get_target_columns();
2453
2454 Tensor<Scaler, 1> target_variables_scalers(target_variables_number);
2455
2456 Index index = 0;
2457
2458 for(Index i = 0; i < target_columns_number; i++)
2459 {
2460 for(Index j = 0; j < target_columns(i).get_variables_number(); j++)
2461 {
2462 target_variables_scalers(index) = target_columns(i).scaler;
2463 index++;
2464 }
2465 }
2466
2467 return target_variables_scalers;
2468}
2469
2470
2472
2473Tensor<string, 1> DataSet::get_columns_names() const
2474{
2475 const Index columns_number = get_columns_number();
2476
2477 Tensor<string, 1> columns_names(columns_number);
2478
2479 for(Index i = 0; i < columns_number; i++)
2480 {
2481 columns_names(i) = columns(i).name;
2482 }
2483
2484 return columns_names;
2485}
2486
2487
2488Tensor<string, 1> DataSet::get_time_series_columns_names() const
2489{
2490 const Index columns_number = get_time_series_columns_number();
2491
2492 Tensor<string, 1> columns_names(columns_number);
2493
2494 for(Index i = 0; i < columns_number; i++)
2495 {
2496 columns_names(i) = time_series_columns(i).name;
2497 }
2498
2499 return columns_names;
2500}
2501
2502
2504
2505Tensor<string, 1> DataSet::get_input_columns_names() const
2506{
2507 const Index input_columns_number = get_input_columns_number();
2508
2509 Tensor<string, 1> input_columns_names(input_columns_number);
2510
2511 Index index = 0;
2512
2513 for(Index i = 0; i < columns.size(); i++)
2514 {
2515 if(columns(i).column_use == VariableUse::Input)
2516 {
2517 input_columns_names(index) = columns(i).name;
2518 index++;
2519 }
2520 }
2521
2522 return input_columns_names;
2523}
2524
2525
2527
2528Tensor<string, 1> DataSet::get_target_columns_names() const
2529{
2530 const Index target_columns_number = get_target_columns_number();
2531
2532 Tensor<string, 1> target_columns_names(target_columns_number);
2533
2534 Index index = 0;
2535
2536 for(Index i = 0; i < columns.size(); i++)
2537 {
2538 if(columns(i).column_use == VariableUse::Target)
2539 {
2540 target_columns_names(index) = columns(i).name;
2541 index++;
2542 }
2543 }
2544
2545 return target_columns_names;
2546}
2547
2548
2550
2551Tensor<string, 1> DataSet::get_used_columns_names() const
2552{
2553 const Index columns_number = get_columns_number();
2554 const Index used_columns_number = get_used_columns_number();
2555
2556 Tensor<string, 1> names(used_columns_number);
2557
2558 Index index = 0 ;
2559
2560 for(Index i = 0; i < columns_number; i++)
2561 {
2562 if(columns(i).column_use != VariableUse::UnusedVariable)
2563 {
2564 names(index) = columns(i).name;
2565 index++;
2566 }
2567 }
2568
2569 return names;
2570}
2571
2572
2574
2576{
2577 Index input_columns_number = 0;
2578
2579 for(Index i = 0; i < columns.size(); i++)
2580 {
2581 if(columns(i).column_use == VariableUse::Input)
2582 {
2583 input_columns_number++;
2584 }
2585 }
2586
2587 return input_columns_number;
2588}
2589
2590
2591Index DataSet::get_input_time_series_columns_number() const
2592{
2593 Index input_columns_number = 0;
2594
2595 for(Index i = 0; i < time_series_columns.size(); i++)
2596 {
2597 if(time_series_columns(i).column_use == VariableUse::Input)
2598 {
2599 input_columns_number++;
2600 }
2601 }
2602
2603 return input_columns_number;
2604}
2605
2606
2608
2610{
2611 Index target_columns_number = 0;
2612
2613 for(Index i = 0; i < columns.size(); i++)
2614 {
2615 if(columns(i).column_use == VariableUse::Target)
2616 {
2617 target_columns_number++;
2618 }
2619 }
2620
2621 return target_columns_number;
2622}
2623
2624
2625Index DataSet::get_target_time_series_columns_number() const
2626{
2627 Index target_columns_number = 0;
2628
2629 for(Index i = 0; i < time_series_columns.size(); i++)
2630 {
2631 if(time_series_columns(i).column_use == VariableUse::Target)
2632 {
2633 target_columns_number++;
2634 }
2635 }
2636
2637 return target_columns_number;
2638}
2639
2640
2642
2644{
2645 Index time_columns_number = 0;
2646
2647 for(Index i = 0; i < columns.size(); i++)
2648 {
2649 if(columns(i).column_use == VariableUse::Time)
2650 {
2651 time_columns_number++;
2652 }
2653 }
2654
2655 return time_columns_number;
2656}
2657
2658
2660
2662{
2663 Index unused_columns_number = 0;
2664
2665 for(Index i = 0; i < columns.size(); i++)
2666 {
2667 if(columns(i).column_use == VariableUse::UnusedVariable)
2668 {
2669 unused_columns_number++;
2670 }
2671 }
2672
2673 return unused_columns_number;
2674}
2675
2676
2678
2680{
2681 Index used_columns_number = 0;
2682
2683 for(Index i = 0; i < columns.size(); i++)
2684 {
2685 if(columns(i).column_use != VariableUse::UnusedVariable)
2686 {
2687 used_columns_number++;
2688 }
2689 }
2690
2691 return used_columns_number;
2692}
2693
2694
2696
/// Returns the columns description vector of the data set.

Tensor<DataSet::Column, 1> DataSet::get_columns() const
{
    return columns;
}
2701
2702
/// Returns the columns description vector of the time series data.

Tensor<DataSet::Column, 1> DataSet::get_time_series_columns() const
{
    return time_series_columns;
}
2707
2708
/// Returns the number of rows of the time series data matrix.

Index DataSet::get_time_series_data_rows_number() const
{
    return time_series_data.dimension(0);
}
2713
2714
2716
2717Tensor<DataSet::Column, 1> DataSet::get_input_columns() const
2718{
2719 const Index inputs_number = get_input_columns_number();
2720
2721 Tensor<Column, 1> input_columns(inputs_number);
2722 Index input_index = 0;
2723
2724 for(Index i = 0; i < columns.size(); i++)
2725 {
2726 if(columns(i).column_use == VariableUse::Input)
2727 {
2728 input_columns(input_index) = columns(i);
2729 input_index++;
2730 }
2731 }
2732
2733 return input_columns;
2734}
2735
2736
2738
2740{
2741 const Index columns_number = get_columns_number();
2742
2743 Tensor<bool, 1> input_columns_binary(columns_number);
2744
2745 for(Index i = 0; i < columns_number; i++)
2746 {
2747 if(columns(i).column_use == VariableUse::Input)
2748 input_columns_binary(i) = true;
2749 else
2750 input_columns_binary(i) = false;
2751 }
2752
2753 return input_columns_binary;
2754}
2755
2756
2758
2759Tensor<DataSet::Column, 1> DataSet::get_target_columns() const
2760{
2761 const Index targets_number = get_target_columns_number();
2762
2763 Tensor<Column, 1> target_columns(targets_number);
2764 Index target_index = 0;
2765
2766 for(Index i = 0; i < columns.size(); i++)
2767 {
2768 if(columns(i).column_use == VariableUse::Target)
2769 {
2770 target_columns(target_index) = columns(i);
2771 target_index++;
2772 }
2773 }
2774
2775 return target_columns;
2776}
2777
2778
2780
2781Tensor<DataSet::Column, 1> DataSet::get_used_columns() const
2782{
2783 const Index used_columns_number = get_used_columns_number();
2784
2785 const Tensor<Index, 1> used_columns_indices = get_used_columns_indices();
2786
2787 Tensor<DataSet::Column, 1> used_columns(used_columns_number);
2788
2789 for(Index i = 0; i < used_columns_number; i++)
2790 {
2791 used_columns(i) = columns(used_columns_indices(i));
2792 }
2793
2794 return used_columns;
2795}
2796
2797
2799
2801{
2802 return columns.size();
2803}
2804
2806
2808{
2809 return time_series_columns.size();
2810}
2811
2813
2815{
2816 Index variables_number = 0;
2817
2818 for (Index i = 0; i < columns.size(); i++)
2819 {
2820 if (columns(i).type == ColumnType::Categorical)
2821 {
2822 variables_number += columns(i).categories.size();
2823 }
2824 else
2825 {
2826 variables_number++;
2827 }
2828 }
2829
2830 return variables_number;
2831}
2832
2834
2836{
2837 Index variables_number = 0;
2838
2839 for(Index i = 0; i < time_series_columns.size(); i++)
2840 {
2841 if(columns(i).type == ColumnType::Categorical)
2842 {
2843 variables_number += time_series_columns(i).categories.size();
2844 }
2845 else
2846 {
2847 variables_number++;
2848 }
2849 }
2850
2851 return variables_number;
2852}
2853
2854
2858
2860{
2861 Index inputs_number = 0;
2862
2863 for(Index i = 0; i < columns.size(); i++)
2864 {
2865 if(columns(i).type == ColumnType::Categorical)
2866 {
2867 for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2868 {
2869 if(columns(i).categories_uses(j) == VariableUse::Input) inputs_number++;
2870 }
2871 }
2872 else if(columns(i).column_use == VariableUse::Input)
2873 {
2874 inputs_number++;
2875 }
2876 }
2877
2878 return inputs_number;
2879}
2880
2881
2883
2885{
2886 Index targets_number = 0;
2887
2888 for(Index i = 0; i < columns.size(); i++)
2889 {
2890 if(columns(i).type == ColumnType::Categorical)
2891 {
2892 for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2893 {
2894 if(columns(i).categories_uses(j) == VariableUse::Target) targets_number++;
2895 }
2896
2897 }
2898 else if(columns(i).column_use == VariableUse::Target)
2899 {
2900 targets_number++;
2901 }
2902 }
2903
2904 return targets_number;
2905}
2906
2907
2909
2911{
2912 Index unused_number = 0;
2913
2914 for(Index i = 0; i < columns.size(); i++)
2915 {
2916 if(columns(i).type == ColumnType::Categorical)
2917 {
2918 for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2919 {
2920 if(columns(i).categories_uses(j) == VariableUse::UnusedVariable) unused_number++;
2921 }
2922
2923 }
2924 else if(columns(i).column_use == VariableUse::UnusedVariable)
2925 {
2926 unused_number++;
2927 }
2928 }
2929
2930 return unused_number;
2931}
2932
2933
2936
/// Returns the index of the variable with the given name.
/// @param name Name of the variable to look up.
/// NOTE(review): when the name is not found this returns 0, which is
/// indistinguishable from a match on the first variable — the throw below is
/// disabled. Confirm callers never rely on this to detect a missing name.

Index DataSet::get_variable_index(const string& name) const
{
    const Index variables_number = get_variables_number();

    const Tensor<string, 1> variables_names = get_variables_names();

    for(Index i = 0; i < variables_number; i++)
    {
        if(variables_names(i) == name) return i;
    }

    return 0;

// throw exception("Exception: Index DataSet::get_variable_index(const string& name) const");
}
2952
2953
2955
2957{
2958 const Index unused_number = get_unused_variables_number();
2959
2960 const Tensor<Index, 1> unused_columns_indices = get_unused_columns_indices();
2961
2962 Tensor<Index, 1> unused_indices(unused_number);
2963
2964 Index unused_index = 0;
2965 Index unused_variable_index = 0;
2966
2967 for(Index i = 0; i < columns.size(); i++)
2968 {
2969 if(columns(i).type == ColumnType::Categorical)
2970 {
2971 const Index current_categories_number = columns(i).get_categories_number();
2972
2973 for(Index j = 0; j < current_categories_number; j++)
2974 {
2975 if(columns(i).categories_uses(j) == VariableUse::UnusedVariable)
2976 {
2977 unused_indices(unused_index) = unused_variable_index;
2978 unused_index++;
2979 }
2980
2981 unused_variable_index++;
2982 }
2983 }
2984 else if(columns(i).column_use == VariableUse::UnusedVariable)
2985 {
2986 unused_indices(unused_index) = i;
2987 unused_index++;
2988 unused_variable_index++;
2989 }
2990 else
2991 {
2992 unused_variable_index++;
2993 }
2994 }
2995
2996 return unused_indices;
2997}
2998
2999
3001
3003{
3004 const Index used_number = get_used_variables_number();
3005
3006 Tensor<Index, 1> used_indices(used_number);
3007
3008 Index used_index = 0;
3009 Index used_variable_index = 0;
3010
3011 for(Index i = 0; i < columns.size(); i++)
3012 {
3013 if(columns(i).type == ColumnType::Categorical)
3014 {
3015 const Index current_categories_number = columns(i).get_categories_number();
3016
3017 for(Index j = 0; j < current_categories_number; j++)
3018 {
3019 if(columns(i).categories_uses(j) != VariableUse::UnusedVariable)
3020 {
3021 used_indices(used_index) = used_variable_index;
3022 used_index++;
3023 }
3024
3025 used_variable_index++;
3026 }
3027 }
3028 else if(columns(i).column_use != VariableUse::UnusedVariable)
3029 {
3030 used_indices(used_index) = used_variable_index;
3031 used_index++;
3032 used_variable_index++;
3033 }
3034 else
3035 {
3036 used_variable_index++;
3037 }
3038 }
3039
3040 return used_indices;
3041}
3042
3043
3044
3046
{
    // Returns the positions, within the flat variables array, of the variables
    // used as inputs. Categorical columns are expanded category by category.

    const Index inputs_number = get_input_variables_number();

    const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();

    Tensor<Index, 1> input_variables_indices(inputs_number);

    Index input_index = 0;           // next free slot in the result
    Index input_variable_index = 0;  // running index over ALL variables

    for(Index i = 0; i < columns.size(); i++)
    {

        if(columns(i).type == ColumnType::Categorical)
        {
            const Index current_categories_number = columns(i).get_categories_number();

            for(Index j = 0; j < current_categories_number; j++)
            {
                if(columns(i).categories_uses(j) == VariableUse::Input)
                {
                    input_variables_indices(input_index) = input_variable_index;
                    input_index++;
                }

                input_variable_index++;
            }
        }
        else if(columns(i).column_use == VariableUse::Input) // Binary, numeric
        {
            input_variables_indices(input_index) = input_variable_index;
            input_index++;
            input_variable_index++;
        }
        else
        {
            // Non-input column: advance the running index only.
            input_variable_index++;
        }
    }

    return input_variables_indices;
}
3090
3091
3093
{
    // Returns the positions, within the flat variables array, of the variables
    // used as targets. Mirrors get_input_variables_indices() for Target use.

    const Index targets_number = get_target_variables_number();

    const Tensor<Index, 1> target_columns_indices = get_target_columns_indices();

    Tensor<Index, 1> target_variables_indices(targets_number);

    Index target_index = 0;           // next free slot in the result
    Index target_variable_index = 0;  // running index over ALL variables

    for(Index i = 0; i < columns.size(); i++)
    {
        if(columns(i).type == ColumnType::Categorical)
        {
            const Index current_categories_number = columns(i).get_categories_number();

            for(Index j = 0; j < current_categories_number; j++)
            {
                if(columns(i).categories_uses(j) == VariableUse::Target)
                {
                    target_variables_indices(target_index) = target_variable_index;
                    target_index++;
                }

                target_variable_index++;
            }
        }
        else if(columns(i).column_use == VariableUse::Target) // Binary, numeric
        {
            target_variables_indices(target_index) = target_variable_index;
            target_index++;
            target_variable_index++;
        }
        else
        {
            // Non-target column: advance the running index only.
            target_variable_index++;
        }
    }

    return target_variables_indices;
}
3136
3137
3141
3142void DataSet::set_columns_uses(const Tensor<string, 1>& new_columns_uses)
3143{
3144 const Index new_columns_uses_size = new_columns_uses.size();
3145
3146 if(new_columns_uses_size != columns.size())
3147 {
3148 ostringstream buffer;
3149
3150 buffer << "OpenNN Exception: DataSet class.\n"
3151 << "void set_columns_uses(const Tensor<string, 1>&) method.\n"
3152 << "Size of columns uses ("
3153 << new_columns_uses_size << ") must be equal to columns size ("
3154 << columns.size() << "). \n";
3155
3156 throw logic_error(buffer.str());
3157 }
3158
3159 for(Index i = 0; i < new_columns_uses.size(); i++)
3160 {
3161 columns(i).set_use(new_columns_uses(i));
3162 }
3163
3164 input_variables_dimensions.resize(1);
3165 input_variables_dimensions.setConstant(get_input_variables_number());
3166}
3167
3168
3172
3173void DataSet::set_columns_uses(const Tensor<VariableUse, 1>& new_columns_uses)
3174{
3175 const Index new_columns_uses_size = new_columns_uses.size();
3176
3177 if(new_columns_uses_size != columns.size())
3178 {
3179 ostringstream buffer;
3180
3181 buffer << "OpenNN Exception: DataSet class.\n"
3182 << "void set_columns_uses(const Tensor<string, 1>&) method.\n"
3183 << "Size of columns uses (" << new_columns_uses_size << ") must be equal to columns size (" << columns.size() << "). \n";
3184
3185 throw logic_error(buffer.str());
3186 }
3187
3188 for(Index i = 0; i < new_columns_uses.size(); i++)
3189 {
3190 columns(i).set_use(new_columns_uses(i));
3191 }
3192
3193 input_variables_dimensions.resize(1);
3194 input_variables_dimensions.setConstant(get_input_variables_number());
3195}
3196
3197
3199
{
    // Marks every column as unused, propagating through set_column_use so
    // categorical category uses are updated as well.

    const Index columns_number = get_columns_number();

    for(Index i = 0; i < columns_number; i++)
    {
        set_column_use(i, VariableUse::UnusedVariable);
    }
}
3209
3210
3211void DataSet::set_input_target_columns(const Tensor<Index, 1>& input_columns, const Tensor<Index, 1>& target_columns)
3212{
3214
3215 for(Index i = 0; i < input_columns.size(); i++)
3216 {
3217 set_column_use(input_columns(i), VariableUse::Input);
3218 }
3219
3220 for(Index i = 0; i < target_columns.size(); i++)
3221 {
3222 set_column_use(target_columns(i), VariableUse::Target);
3223 }
3224}
3225
3226
3228
{
    // Marks every column currently used as an input as unused;
    // target and already-unused columns are left untouched.

    const Index columns_number = get_columns_number();

    for(Index i = 0; i < columns_number; i++)
    {
        if(columns(i).column_use == DataSet::VariableUse::Input) set_column_use(i, VariableUse::UnusedVariable);
    }
}
3238
3239
3240
3241void DataSet::set_input_columns(const Tensor<Index, 1>& input_columns_indices, const Tensor<bool, 1>& input_columns_use)
3242{
3243 for(Index i = 0; i < input_columns_indices.size(); i++)
3244 {
3245 if(input_columns_use(i)) set_column_use(input_columns_indices(i), VariableUse::Input);
3246 else set_column_use(input_columns_indices(i), VariableUse::UnusedVariable);
3247 }
3248}
3249
3250
3254
3255void DataSet::set_column_use(const Index& index, const VariableUse& new_use)
3256{
3257 columns(index).column_use = new_use;
3258
3259 if(columns(index).type == ColumnType::Categorical)
3260 {
3261 columns(index).set_categories_uses(new_use);
3262 }
3263}
3264
3265
3269
3270void DataSet::set_column_use(const string& name, const VariableUse& new_use)
3271{
3272 const Index index = get_column_index(name);
3273
3274 set_column_use(index, new_use);
3275}
3276
3277void DataSet::set_column_type(const Index& index, const ColumnType& new_type)
3278{
3279 columns[index].type = new_type;
3280}
3281
3282
3283void DataSet::set_column_type(const string& name, const ColumnType& new_type)
3284{
3285 const Index index = get_column_index(name);
3286
3287 set_column_type(index, new_type);
3288}
3289
3290
3294
3295void DataSet::set_variable_name(const Index& variable_index, const string& new_variable_name)
3296{
3297#ifdef OPENNN_DEBUG
3298
3299 const Index variables_number = get_variables_number();
3300
3301 if(variable_index >= variables_number)
3302 {
3303 ostringstream buffer;
3304
3305 buffer << "OpenNN Exception: Variables class.\n"
3306 << "void set_name(const Index&, const string&) method.\n"
3307 << "Index of variable must be less than number of variables.\n";
3308
3309 throw logic_error(buffer.str());
3310 }
3311
3312#endif
3313
3314 const Index columns_number = get_columns_number();
3315
3316 Index index = 0;
3317
3318 for(Index i = 0; i < columns_number; i++)
3319 {
3320 if(columns(i).type == ColumnType::Categorical)
3321 {
3322 for(Index j = 0; j < columns(i).get_categories_number(); j++)
3323 {
3324 if(index == variable_index)
3325 {
3326 columns(i).categories(j) = new_variable_name;
3327 return;
3328 }
3329 else
3330 {
3331 index++;
3332 }
3333 }
3334 }
3335 else
3336 {
3337 if(index == variable_index)
3338 {
3339 columns(i).name = new_variable_name;
3340 return;
3341 }
3342 else
3343 {
3344 index++;
3345 }
3346 }
3347 }
3348}
3349
3350
3354
3355void DataSet::set_variables_names(const Tensor<string, 1>& new_variables_names)
3356{
3357#ifdef OPENNN_DEBUG
3358
3359 const Index variables_number = get_variables_number();
3360
3361 const Index size = new_variables_names.size();
3362
3363 if(size != variables_number)
3364 {
3365 ostringstream buffer;
3366
3367 buffer << "OpenNN Exception: Variables class.\n"
3368 << "void set_names(const Tensor<string, 1>&) method.\n"
3369 << "Size (" << size << ") must be equal to number of variables (" << variables_number << ").\n";
3370
3371 throw logic_error(buffer.str());
3372 }
3373
3374#endif
3375
3376 const Index columns_number = get_columns_number();
3377
3378 Index index = 0;
3379
3380 for(Index i = 0; i < columns_number; i++)
3381 {
3382 if(columns(i).type == ColumnType::Categorical)
3383 {
3384 for(Index j = 0; j < columns(i).get_categories_number(); j++)
3385 {
3386 columns(i).categories(j) = new_variables_names(index);
3387 index++;
3388 }
3389 }
3390 else
3391 {
3392 columns(i).name = new_variables_names(index);
3393 index++;
3394 }
3395 }
3396}
3397
3398
3402
3403void DataSet::set_columns_names(const Tensor<string, 1>& new_names)
3404{
3405 const Index new_names_size = new_names.size();
3406 const Index columns_number = get_columns_number();
3407
3408 if(new_names_size != columns_number)
3409 {
3410 ostringstream buffer;
3411
3412 buffer << "OpenNN Exception: DataSet class.\n"
3413 << "void set_columns_names(const Tensor<string, 1>&).\n"
3414 << "Size of names (" << new_names.size() << ") is not equal to columns number (" << columns_number << ").\n";
3415
3416 throw logic_error(buffer.str());
3417 }
3418
3419 for(Index i = 0; i < columns_number; i++)
3420 {
3421 columns(i).name = new_names(i);
3422 }
3423}
3424
3425
3427
{
    // Marks every non-constant column as an input;
    // constant columns are skipped and keep their current use.

    for(Index i = 0; i < columns.size(); i++)
    {
        if(columns(i).type == ColumnType::Constant) continue;

        columns(i).set_use(VariableUse::Input);
    }
}
3437
3438
3440
{
    // Marks every column as a target, regardless of its type.

    for(Index i = 0; i < columns.size(); i++)
    {
        columns(i).set_use(VariableUse::Target);
    }
}
3448
3449
3451
{
    // Marks every column as unused via Column::set_use
    // (unlike the set_column_use based loop elsewhere in this file).

    for(Index i = 0; i < columns.size(); i++)
    {
        columns(i).set_use(VariableUse::UnusedVariable);
    }
}
3459
3460
3464
/// Resizes the columns container to the given size.
/// Existing column metadata beyond the new size is discarded; new slots are
/// default-constructed.
/// @param new_columns_number New number of columns.

void DataSet::set_columns_number(const Index& new_columns_number)
{
    columns.resize(new_columns_number);

}
3471
3472
3473void DataSet::set_columns_scalers(const Scaler& scalers)
3474{
3475 const Index columns_number = get_columns_number();
3476
3477 for(Index i = 0; i < columns_number; i++)
3478 {
3479 columns(i).scaler = scalers;
3480 }
3481}
3482
/// Scans numeric columns and reclassifies those holding at most two distinct
/// non-NaN values as Binary, rescaling them to {0,1} and assigning category
/// labels and uses. The values buffer is seeded with random numbers so the
/// initial comparisons against values(0)/values(1) cannot accidentally match
/// real data — TODO confirm this sentinel scheme cannot collide with data.
void DataSet::set_binary_simple_columns()
{
    bool is_binary = true;

    Index variable_index = 0;  // position of the column's first variable in `data`

    Index different_values = 0;

    for(Index column_index = 0; column_index < columns.size(); column_index++)
    {
        if(columns(column_index).type == ColumnType::Numeric)
        {
            // Random seed values act as "no value seen yet" sentinels.
            Tensor<type, 1> values(3);
            values.setRandom();
            different_values = 0;
            is_binary = true;

            for(Index row_index = 0; row_index < data.dimension(0); row_index++)
            {
                // Record a value only if it is not NaN and differs from the
                // (up to two) values already collected.
                if(!isnan(data(row_index, variable_index))
                        && data(row_index, variable_index) != values(0)
                        && data(row_index, variable_index) != values(1))
                {
                    values(different_values) = data(row_index, variable_index);

                    different_values++;
                }

                // A column with a single distinct value is constant, not binary.
                if(row_index == (data.dimension(0)-1)){
                    if(different_values == 1){
                        is_binary = false;
                        break;
                    }
                }

                if(different_values > 2)
                {
                    is_binary = false;
                    break;
                }
            }

            if(is_binary)
            {
                columns(column_index).type = ColumnType::Binary;
                // Map the two observed values onto {0,1} in place.
                scale_minimum_maximum_binary(data, values(0), values(1), column_index);
                columns(column_index).categories.resize(2);

                if(values(0) == type(0) && values(1) == type(1))
                {
                    columns(column_index).categories(0) = "Negative (0)";
                    columns(column_index).categories(1) = "Positive (1)";
                }
                else if(values(0) == type(1) && values(1) == type(0))
                {
                    columns(column_index).categories(0) = "Positive (1)";
                    columns(column_index).categories(1) = "Negative (0)";
                }
                else
                {
                    columns(column_index).categories(0) = "Class_1";
                    columns(column_index).categories(1) = "Class_2";
                }

                // Both categories inherit the column's current use.
                const VariableUse column_use = columns(column_index).column_use;
                columns(column_index).categories_uses.resize(2);
                columns(column_index).categories_uses(0) = column_use;
                columns(column_index).categories_uses(1) = column_use;
            }

            variable_index++;
        }
        else if(columns(column_index).type == ColumnType::Categorical)
        {
            // Categorical columns span one variable per category.
            variable_index += columns(column_index).get_categories_number();
        }
        else
        {
            variable_index++;
        }
    }
}
3565
/// Detects constant columns and marks them Constant/unused.
/// Numeric columns with near-zero standard deviation, and Binary/Categorical
/// columns with a single category, are reclassified; DateTime columns are
/// always set unused. variable_index tracks the column's position in `data`
/// and must be advanced in every branch.
void DataSet::check_constant_columns()
{
    if(display) cout << "Checking constant columns..." << endl;

    Index variable_index = 0;

    for(Index column = 0; column < get_columns_number(); column++)
    {
        if(columns(column).type == ColumnType::Numeric)
        {
            // @todo avoid chip

            const Tensor<type, 1> numeric_column = data.chip(variable_index, 1);

            // Tolerance-based constancy test, not exact equality.
            if(standard_deviation(numeric_column) < static_cast<type>(1.0e-3))
            {
                columns(column).type = ColumnType::Constant;
                columns(column).column_use = VariableUse::UnusedVariable;
            }

            variable_index++;
        }
        else if(columns(column).type == ColumnType::DateTime)
        {
            columns(column).column_use = VariableUse::UnusedVariable;
            variable_index++;
        }
        else if(columns(column).type == ColumnType::Constant)
        {
            variable_index++;
        }
        else if(columns(column).type == ColumnType::Binary)
        {
            if(columns(column).get_categories_number() == 1)
            {
                columns(column).type = ColumnType::Constant;
                columns(column).column_use = VariableUse::UnusedVariable;
            }

            variable_index++;
        }
        else if(columns(column).type == ColumnType::Categorical)
        {
            if(columns(column).get_categories_number() == 1)
            {
                columns(column).type = ColumnType::Constant;
                columns(column).column_use = VariableUse::UnusedVariable;
            }

            // Categorical columns occupy one data column per category.
            variable_index += columns(column).get_categories_number();
        }
    }
}
3619
3620Tensor<type, 2> DataSet::transform_binary_column(const Tensor<type, 1>& column) const
3621
3622{
3623 const Index rows_number = column.dimension(0);
3624
3625 Tensor<type, 2> new_column(rows_number , 2);
3626 new_column.setZero();
3627
3628 for(Index i = 0; i < rows_number; i++)
3629 {
3630 if(abs(column(i) - static_cast<type>(1)) < type(NUMERIC_LIMITS_MIN))
3631 {
3632 new_column(i,1) = static_cast<type>(1);
3633 }
3634 else if(abs(column(i)) < type(NUMERIC_LIMITS_MIN))
3635 {
3636 new_column(i,0) = static_cast<type>(1);
3637 }
3638 else
3639 {
3640 new_column(i,0) = type(NAN);
3641 new_column(i,1) = type(NAN);
3642 }
3643 }
3644
3645 return new_column;
3646}
3647
3649
/// Sets the dimensions of the input variables (e.g. for image data).
/// @param new_inputs_dimensions Tensor with the new input dimensions.

void DataSet::set_input_variables_dimensions(const Tensor<Index, 1>& new_inputs_dimensions)
{
    input_variables_dimensions = new_inputs_dimensions;
}
3654
3655
3657
3659{
3660 if(data.dimension(0) == 0 || data.dimension(1) == 0)
3661 {
3662 return true;
3663 }
3664
3665 return false;
3666}
3667
3668
3672
/// Returns a const reference to the data matrix (samples x variables).

const Tensor<type, 2>& DataSet::get_data() const
{
    return data;
}
3677
3678
/// Returns a mutable pointer to the data matrix.
/// The caller must not outlive this DataSet with the pointer.

Tensor<type, 2>* DataSet::get_data_pointer()
{
    return &data;
}
3683
3684
3687
/// Returns a const reference to the original (untransformed) time series data.

const Tensor<type, 2>& DataSet::get_time_series_data() const
{
    return time_series_data;
}
3692
3693
3695
{
    // Returns the method used to handle missing values.
    return missing_values_method;
}
3700
3701
3703
/// Returns the name of the data file from which the data set is read.

const string& DataSet::get_data_file_name() const
{
    return data_file_name;
}
3708
3709
3711
/// Returns true if the data file's first row contains column names.

const bool& DataSet::get_header_line() const
{
    return has_columns_names;
}
3716
3717
3719
/// Returns true if the data file's first column contains row labels.

const bool& DataSet::get_rows_label() const
{
    return has_rows_labels;
}
3724
3725
/// Returns a copy of the row labels, one per sample.

Tensor<string, 1> DataSet::get_rows_label_tensor() const
{
    return rows_labels;
}
3730
3731Tensor<string, 1> DataSet::get_testing_rows_label_tensor()
3732{
3733 const Index testing_samples_number = get_testing_samples_number();
3734 const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
3735 Tensor<string, 1> testing_rows_label(testing_samples_number);
3736
3737 for(Index i = 0; i < testing_samples_number; i++)
3738 {
3739 testing_rows_label(i) = rows_labels(testing_indices(i));
3740 }
3741
3742 return testing_rows_label;
3743}
3744
3745
3746Tensor<string, 1> DataSet::get_selection_rows_label_tensor()
3747{
3748 const Index selection_samples_number = get_selection_samples_number();
3749 const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
3750 Tensor<string, 1> selection_rows_label(selection_samples_number);
3751
3752 for(Index i = 0; i < selection_samples_number; i++)
3753 {
3754 selection_rows_label(i) = rows_labels(selection_indices(i));
3755 }
3756
3757 return selection_rows_label;
3758}
3759
3760
3762
{
    // Returns the separator used in the data file (Space, Tab, Comma, Semicolon).
    return separator;
}
3767
3768
3770
{
    // Maps the Separator enum to the corresponding character.
    switch(separator)
    {
    case Separator::Space:
        return ' ';

    case Separator::Tab:
        return '\t';

    case Separator::Comma:
        return ',';

    case Separator::Semicolon:
        return ';';
    }

    // Unreachable for valid enum values; keeps the compiler satisfied.
    return char();
}
3790
3791
3793
{
    // Maps the Separator enum to its display name.
    switch(separator)
    {
    case Separator::Space:
        return "Space";

    case Separator::Tab:
        return "Tab";

    case Separator::Comma:
        return "Comma";

    case Separator::Semicolon:
        return "Semicolon";
    }

    // Unreachable for valid enum values; keeps the compiler satisfied.
    return string();
}
3813
3814
3816
{
    // Returns the string used in the data file to represent missing values.
    return missing_values_label;
}
3821
3822
3824
/// Returns the number of lags used in time series prediction.

const Index& DataSet::get_lags_number() const
{
    return lags_number;
}
3829
3830
3832
/// Returns the number of steps ahead used in time series prediction.

const Index& DataSet::get_steps_ahead() const
{
    return steps_ahead;
}
3837
3838
3840
/// Returns the name of the column used as time index in time series data.

const string& DataSet::get_time_column() const
{
    return time_column;
}
3845
3846
3847Index DataSet::get_time_series_time_column_index() const
3848{
3849 for(Index i = 0; i < time_series_columns.size(); i++)
3850 {
3851 if(time_series_columns(i).type == ColumnType::DateTime) return i;
3852 }
3853
3854 return static_cast<Index>(NAN);
3855}
3856
3857
3860
3861Scaler DataSet::get_scaling_unscaling_method(const string& scaling_unscaling_method)
3862{
3863 if(scaling_unscaling_method == "NoScaling")
3864 {
3865 return Scaler::NoScaling;
3866 }
3867 else if(scaling_unscaling_method == "MinimumMaximum")
3868 {
3869 return Scaler::MinimumMaximum;
3870 }
3871 else if(scaling_unscaling_method == "Logarithmic")
3872 {
3873 return Scaler::Logarithm;
3874 }
3875 else if(scaling_unscaling_method == "MeanStandardDeviation")
3876 {
3877 return Scaler::MeanStandardDeviation;
3878 }
3879 else if(scaling_unscaling_method == "StandardDeviation")
3880 {
3881 return Scaler::StandardDeviation;
3882 }
3883 else
3884 {
3885 ostringstream buffer;
3886
3887 buffer << "OpenNN Exception: DataSet class.\n"
3888 << "static Scaler get_scaling_unscaling_method(const string).\n"
3889 << "Unknown scaling-unscaling method: " << scaling_unscaling_method << ".\n";
3890
3891 throw logic_error(buffer.str());
3892 }
3893}
3894
3895
3899
3900Tensor<type, 2> DataSet::get_training_data() const
3901{
3902
3903// const Index variables_number = get_variables_number();
3904
3905// Tensor<Index, 1> variables_indices(0, 1, variables_number-1);
3906
3907 Tensor<Index, 1> variables_indices = get_used_variables_indices();
3908
3909 const Tensor<Index, 1> training_indices = get_training_samples_indices();
3910
3911 return get_subtensor_data(training_indices, variables_indices);
3912
3913// return Tensor<type, 2>();
3914}
3915
3916
3920
3921Tensor<type, 2> DataSet::get_selection_data() const
3922{
3923 const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
3924
3925 const Index variables_number = get_variables_number();
3926
3927 Tensor<Index, 1> variables_indices;
3928 initialize_sequential(variables_indices, 0, 1, variables_number-1);
3929
3930 return get_subtensor_data(selection_indices, variables_indices);
3931}
3932
3933
3937
3938Tensor<type, 2> DataSet::get_testing_data() const
3939{
3940 const Index variables_number = get_variables_number();
3941
3942 Tensor<Index, 1> variables_indices;
3943 initialize_sequential(variables_indices, 0, 1, variables_number-1);
3944
3945 const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
3946
3947 return get_subtensor_data(testing_indices, variables_indices);
3948}
3949
3950
3954
3955Tensor<type, 2> DataSet::get_input_data() const
3956{
3957 const Index samples_number = get_samples_number();
3958
3959 Tensor<Index, 1> indices;
3960 initialize_sequential(indices, 0, 1, samples_number-1);
3961
3962 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3963
3964 return get_subtensor_data(indices, input_variables_indices);
3965}
3966
3967
3971
3972Tensor<type, 2> DataSet::get_target_data() const
3973{
3974 const Tensor<Index, 1> indices = get_used_samples_indices();
3975
3976 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
3977
3978 return get_subtensor_data(indices, target_variables_indices);
3979}
3980
3981
3985
3986Tensor<type, 2> DataSet::get_input_data(const Tensor<Index, 1>& samples_indices) const
3987{
3988 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
3989
3990 return get_subtensor_data(samples_indices, input_variables_indices);
3991}
3992
3993
3997
3998Tensor<type, 2> DataSet::get_target_data(const Tensor<Index, 1>& samples_indices) const
3999{
4000 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
4001
4002 return get_subtensor_data(samples_indices, target_variables_indices);
4003}
4004
4005
4009
{
    // Returns the submatrix: training samples x input variables.

    const Tensor<Index, 1> training_indices = get_training_samples_indices();

    const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();

    return get_subtensor_data(training_indices, input_variables_indices);
}
4018
4019
4023
{
    // Returns the submatrix: training samples x target variables.

    const Tensor<Index, 1> training_indices = get_training_samples_indices();

    const Tensor<Index, 1>& target_variables_indices = get_target_variables_indices();

    return get_subtensor_data(training_indices, target_variables_indices);
}
4032
4033
4037
{
    // Returns the submatrix: selection samples x input variables.

    const Tensor<Index, 1> selection_indices = get_selection_samples_indices();

    const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();

    return get_subtensor_data(selection_indices, input_variables_indices);
}
4046
4047
4051
{
    // Returns the submatrix: selection samples x target variables.

    const Tensor<Index, 1> selection_indices = get_selection_samples_indices();

    const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();

    return get_subtensor_data(selection_indices, target_variables_indices);
}
4060
4061
4065
4066Tensor<type, 2> DataSet::get_testing_input_data() const
4067{
4068 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
4069
4070 const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
4071
4072 return get_subtensor_data(testing_indices, input_variables_indices);
4073}
4074
4075
4079
{
    // Returns the submatrix: testing samples x target variables.

    const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();

    const Tensor<Index, 1> testing_indices = get_testing_samples_indices();

    return get_subtensor_data(testing_indices, target_variables_indices);
}
4088
4089
4092
/// Returns a single sample (row) of the data matrix.
/// @param index Index of the sample; must be less than the samples number.

Tensor<type, 1> DataSet::get_sample_data(const Index& index) const
{

#ifdef OPENNN_DEBUG

    const Index samples_number = get_samples_number();

    if(index >= samples_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<type, 1> get_sample(const Index&) const method.\n"
               << "Index of sample (" << index << ") must be less than number of samples (" << samples_number << ").\n";

        throw logic_error(buffer.str());
    }

#endif

    // Get sample

    return data.chip(index,0);
}
4117
4118
4122
4123Tensor<type, 1> DataSet::get_sample_data(const Index& sample_index, const Tensor<Index, 1>& variables_indices) const
4124{
4125#ifdef OPENNN_DEBUG
4126
4127 const Index samples_number = get_samples_number();
4128
4129 if(sample_index >= samples_number)
4130 {
4131 ostringstream buffer;
4132
4133 buffer << "OpenNN Exception: DataSet class.\n"
4134 << "Tensor<type, 1> get_sample(const Index&, const Tensor<Index, 1>&) const method.\n"
4135 << "Index of sample must be less than number of \n";
4136
4137 throw logic_error(buffer.str());
4138 }
4139
4140#endif
4141
4142 const Index variables_number = variables_indices.size();
4143
4144 Tensor<type, 1 > row(variables_number);
4145
4146 for(Index i = 0; i < variables_number; i++)
4147 {
4148 Index variable_index = variables_indices(i);
4149
4150 row(i) = data(sample_index, variable_index);
4151 }
4152
4153 return row;
4154
4155 //return data.get_row(sample_index, variables_indices);
4156
4157}
4158
4159
4162
4163Tensor<type, 2> DataSet::get_sample_input_data(const Index& sample_index) const
4164{
4165 const Index input_variables_number = get_input_variables_number();
4166
4167 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
4168
4169 Tensor<type, 2> inputs(1, input_variables_number);
4170
4171 for(Index i = 0; i < input_variables_number; i++)
4172 inputs(0, i) = data(sample_index, input_variables_indices(i));
4173
4174 return inputs;
4175}
4176
4177
4180
4181Tensor<type, 2> DataSet::get_sample_target_data(const Index& sample_index) const
4182{
4183 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
4184
4185 return get_subtensor_data(Tensor<Index, 1>(sample_index), target_variables_indices);
4186}
4187
4188
4191
4192Index DataSet::get_column_index(const string& column_name) const
4193{
4194 const Index columns_number = get_columns_number();
4195
4196 for(Index i = 0; i < columns_number; i++)
4197 {
4198 if(columns(i).name == column_name) return i;
4199 }
4200
4201 ostringstream buffer;
4202
4203 buffer << "OpenNN Exception: DataSet class.\n"
4204 << "Index get_column_index(const string&&) const method.\n"
4205 << "Cannot find " << column_name << "\n";
4206
4207 throw logic_error(buffer.str());
4208}
4209
4210
4213
4214Index DataSet::get_column_index(const Index& variable_index) const
4215{
4216 const Index columns_number = get_columns_number();
4217
4218 Index total_variables_number = 0;
4219
4220 for(Index i = 0; i < columns_number; i++)
4221 {
4222 if(columns(i).type == ColumnType::Categorical)
4223 {
4224 total_variables_number += columns(i).get_categories_number();
4225 }
4226 else
4227 {
4228 total_variables_number++;
4229 }
4230
4231 if((variable_index+1) <= total_variables_number) return i;
4232 }
4233
4234 ostringstream buffer;
4235
4236 buffer << "OpenNN Exception: DataSet class.\n"
4237 << "Index get_column_index(const type&) const method.\n"
4238 << "Cannot find variable index: " << variable_index << ".\n";
4239
4240 throw logic_error(buffer.str());
4241}
4242
4243
4247
4248Tensor<Index, 1> DataSet::get_variable_indices(const Index& column_index) const
4249{
4250 Index index = 0;
4251
4252 for(Index i = 0; i < column_index; i++)
4253 {
4254 if(columns(i).type == ColumnType::Categorical)
4255 {
4256 index += columns(i).categories.size();
4257 }
4258 else
4259 {
4260 index++;
4261 }
4262 }
4263
4264 if(columns(column_index).type == ColumnType::Categorical)
4265 {
4266 Tensor<Index, 1> variable_indices(columns(column_index).categories.size());
4267
4268 for(Index j = 0; j<columns(column_index).categories.size(); j++)
4269 {
4270 variable_indices(j) = index+j;
4271 }
4272
4273 return variable_indices;
4274 }
4275 else
4276 {
4277 Tensor<Index, 1> indices(1);
4278 indices.setConstant(index);
4279
4280 return indices;
4281 }
4282}
4283
4284
4288
4289Tensor<type, 2> DataSet::get_column_data(const Index& column_index) const
4290{
4291 Index columns_number = 1;
4292 const Index rows_number = data.dimension(0);
4293
4294 if(columns(column_index).type == ColumnType::Categorical)
4295 {
4296 columns_number = columns(column_index).get_categories_number();
4297 }
4298
4299 const Eigen::array<Index, 2> extents = {rows_number, columns_number};
4300 const Eigen::array<Index, 2> offsets = {0, get_variable_indices(column_index)(0)};
4301
4302 return data.slice(offsets, extents);
4303}
4304
4305
4308
/// Returns the time series data of a column as a matrix.
/// A categorical column yields one data column per category.
/// NOTE(review): the offset comes from get_variable_indices, which walks the
/// regular `columns` array, while the extent comes from time_series_columns —
/// confirm both arrays are aligned when this is called.
/// @param column_index Index of the time series column.

Tensor<type, 2> DataSet::get_time_series_column_data(const Index& column_index) const
{
    Index columns_number = 1;

    const Index rows_number = time_series_data.dimension(0);

    if(time_series_columns(column_index).type == ColumnType::Categorical)
    {
        columns_number = time_series_columns(column_index).get_categories_number();
    }

    const Eigen::array<Index, 2> extents = {rows_number, columns_number};
    const Eigen::array<Index, 2> offsets = {0, get_variable_indices(column_index)(0)};

    return time_series_data.slice(offsets, extents);
}
4325
4326
4331
/// Returns the data of a column restricted to the given rows.
/// @param column_index Index of the column.
/// @param rows_indices Indices of the rows to extract.

Tensor<type, 2> DataSet::get_column_data(const Index& column_index, const Tensor<Index, 1>& rows_indices) const
{
    return get_subtensor_data(rows_indices, get_variable_indices(column_index));
}
4336
4337
4341
4342Tensor<type, 2> DataSet::get_column_data(const string& column_name) const
4343{
4344 const Index column_index = get_column_index(column_name);
4345
4346 return get_column_data(column_index);
4347}
4348
4349
4352
/// Returns the values of a single variable (one data column).
/// @param index Index of the variable; must be less than the variables number.

Tensor<type, 1> DataSet::get_variable_data(const Index& index) const
{

#ifdef OPENNN_DEBUG

    const Index variables_number = get_variables_number();

    if(index >= variables_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<type, 1> get_variable(const Index&) const method.\n"
               << "Index of variable must be less than number of \n";

        throw logic_error(buffer.str());
    }

#endif

    return data.chip(index, 1);
}
4375
4376
4379
4380Tensor<type, 1> DataSet::get_variable_data(const string& variable_name) const
4381{
4382
4383 const Tensor<string, 1> variable_names = get_variables_names();
4384
4385 Index size = 0;
4386
4387 for(Index i = 0; i < variable_names.size(); i++)
4388 {
4389 if(variable_names(i) == variable_name) size++;
4390 }
4391
4392 Tensor<Index, 1> variable_index(size);
4393
4394 Index index = 0;
4395
4396 for(Index i = 0; i < variable_names.size(); i++)
4397 {
4398 if(variable_names(i) == variable_name)
4399 {
4400 variable_index(index) = i;
4401
4402 index++;
4403 }
4404 }
4405
4406#ifdef OPENNN_DEBUG
4407
4408 const Index variables_size = variable_index.size();
4409
4410 if(variables_size == 0)
4411 {
4412 ostringstream buffer;
4413
4414 buffer << "OpenNN Exception: DataSet class.\n"
4415 << "Tensor<type, 1> get_variable(const string&) const method.\n"
4416 << "Variable: " << variable_name << " does not exist.\n";
4417
4418 throw logic_error(buffer.str());
4419 }
4420
4421 if(variables_size > 1)
4422 {
4423 ostringstream buffer;
4424
4425 buffer << "OpenNN Exception: DataSet class.\n"
4426 << "Tensor<type, 1> get_variable(const string&) const method.\n"
4427 << "Variable: " << variable_name << " appears more than once in the data set.\n";
4428
4429 throw logic_error(buffer.str());
4430 }
4431
4432#endif
4433
4434 return data.chip(variable_index(0), 1);
4435}
4436
4437
4441
4442Tensor<type, 1> DataSet::get_variable_data(const Index& variable_index, const Tensor<Index, 1>& samples_indices) const
4443{
4444
4445#ifdef OPENNN_DEBUG
4446
4447 const Index variables_number = get_variables_number();
4448
4449 if(variable_index >= variables_number)
4450 {
4451 ostringstream buffer;
4452
4453 buffer << "OpenNN Exception: DataSet class.\n"
4454 << "Tensor<type, 1> get_variable(const Index&, const Tensor<Index, 1>&) const method.\n"
4455 << "Index of variable must be less than number of \n";
4456
4457 throw logic_error(buffer.str());
4458 }
4459
4460#endif
4461
4462 const Index samples_indices_size = samples_indices.size();
4463
4464 Tensor<type, 1 > column(samples_indices_size);
4465
4466 for(Index i = 0; i < samples_indices_size; i++)
4467 {
4468 Index sample_index = samples_indices(i);
4469
4470 column(i) = data(sample_index, variable_index);
4471 }
4472
4473 return column;
4474}
4475
4476
4480
4481Tensor<type, 1> DataSet::get_variable_data(const string& variable_name, const Tensor<Index, 1>& samples_indices) const
4482{
4483
4484 const Tensor<string, 1> variable_names = get_variables_names();
4485
4486 Index size = 0;
4487
4488 for(Index i = 0; i < variable_names.size(); i++)
4489 {
4490 if(variable_names(i) == variable_name) size++;
4491 }
4492
4493 Tensor<Index, 1> variable_index(size);
4494
4495 Index index = 0;
4496
4497 for(Index i = 0; i < variable_names.size(); i++)
4498 {
4499 if(variable_names(i) == variable_name)
4500 {
4501 variable_index(index) = i;
4502
4503 index++;
4504 }
4505 }
4506
4507#ifdef OPENNN_DEBUG
4508
4509 const Index variables_size = variable_index.size();
4510
4511 if(variables_size == 0)
4512 {
4513 ostringstream buffer;
4514
4515 buffer << "OpenNN Exception: DataSet class.\n"
4516 << "Tensor<type, 1> get_variable(const string&) const method.\n"
4517 << "Variable: " << variable_name << " does not exist.\n";
4518
4519 throw logic_error(buffer.str());
4520 }
4521
4522 if(variables_size > 1)
4523 {
4524 ostringstream buffer;
4525
4526 buffer << "OpenNN Exception: DataSet class.\n"
4527 << "Tensor<type, 1> get_variable(const string&, const Tensor<Index, 1>&) const method.\n"
4528 << "Variable: " << variable_name << " appears more than once in the data set.\n";
4529
4530 throw logic_error(buffer.str());
4531 }
4532
4533#endif
4534
4535 const Index samples_indices_size = samples_indices.size();
4536
4537 Tensor<type, 1 > column(samples_indices_size);
4538
4539 for(Index i = 0; i < samples_indices_size; i++)
4540 {
4541 Index sample_index = samples_indices(i);
4542
4543 column(i) = data(sample_index, variable_index(0));
4544 }
4545
4546 return column;
4547}
4548
4549
/// Returns the stored preview of the data file (a vector of rows, each a vector
/// of string fields), filled when the data file is read.

Tensor<Tensor<string, 1>, 1> DataSet::get_data_file_preview() const
{
    return data_file_preview;
}
4554
4555
4556Tensor<type, 2> DataSet::get_subtensor_data(const Tensor<Index, 1> & rows_indices, const Tensor<Index, 1> & variables_indices) const
4557{
4558 const Index rows_number = rows_indices.size();
4559 const Index variables_number = variables_indices.size();
4560
4561 Tensor<type, 2> subtensor(rows_number, variables_number);
4562
4563 Index row_index;
4564 Index variable_index;
4565
4566 const Tensor<type, 2>& data = get_data();
4567
4568 for(Index i = 0; i < rows_number; i++)
4569 {
4570 row_index = rows_indices(i);
4571
4572 for(Index j = 0; j < variables_number; j++)
4573 {
4574 variable_index = variables_indices(j);
4575
4576 subtensor(i, j) = data(row_index, variable_index);
4577 }
4578 }
4579
4580 return subtensor;
4581}
4582
4583
4585
{
    // Resets the data set to empty: data matrix, sample uses, column
    // definitions, time series structures and missing-values counters.

// NonBlockingThreadPool* non_blocking_thread_pool = nullptr;
// ThreadPoolDevice* thread_pool_device = nullptr;

    data.resize(0,0);

    samples_uses.resize(0);

    columns.resize(0);

    time_series_data.resize(0,0);

    time_series_columns.resize(0);


    columns_missing_values_number.resize(0);
}
4604
4605
/// Sets the data set from a data file: resets the object, applies the defaults,
/// stores whether the file has a header row and then reads the CSV file.
/// @param data_file_name Path of the data file.
/// NOTE(review): this parameter is not used in the body shown here — confirm
/// the file name is stored before read_csv() relies on it.
/// @param separator Column separator character. NOTE(review): also unused here — verify.
/// @param new_has_columns_names True if the first row contains the column names.

void DataSet::set(const string& data_file_name, const char& separator, const bool& new_has_columns_names)
{
    set();

    set_default();

    set_has_columns_names(new_has_columns_names);

    read_csv();
}
4623
4624
4627
4628void DataSet::set(const Tensor<type, 2>& new_data)
4629{
4630 data_file_name = "";
4631
4632 const Index variables_number = new_data.dimension(1);
4633 const Index samples_number = new_data.dimension(0);
4634
4635 set(samples_number, variables_number);
4636
4637 data = new_data;
4638
4640}
4641
4642
4648
4649void DataSet::set(const Index& new_samples_number, const Index& new_variables_number)
4650{
4651#ifdef OPENNN_DEBUG
4652
4653 if(new_samples_number == 0)
4654 {
4655 ostringstream buffer;
4656
4657 buffer << "OpenNN Exception: DataSet class.\n"
4658 << "void set(const Index&, const Index&) method.\n"
4659 << "Number of samples must be greater than zero.\n";
4660
4661 throw logic_error(buffer.str());
4662 }
4663
4664 if(new_variables_number == 0)
4665 {
4666 ostringstream buffer;
4667
4668 buffer << "OpenNN Exception: DataSet class.\n"
4669 << "void set(const Index&, const Index&) method.\n"
4670 << "Number of variables must be greater than zero.\n";
4671
4672 throw logic_error(buffer.str());
4673 }
4674
4675#endif
4676
4677 data.resize(new_samples_number, new_variables_number);
4678
4679 columns.resize(new_variables_number);
4680
4681 for(Index index = 0; index < new_variables_number-1; index++)
4682 {
4683 columns(index).name = "column_" + to_string(index+1);
4684 columns(index).column_use = VariableUse::Input;
4685 columns(index).type = ColumnType::Numeric;
4686 }
4687
4688 columns(new_variables_number-1).name = "column_" + to_string(new_variables_number);
4689 columns(new_variables_number-1).column_use = VariableUse::Target;
4690 columns(new_variables_number-1).type = ColumnType::Numeric;
4691
4692 samples_uses.resize(new_samples_number);
4694}
4695
4696
4702
4703void DataSet::set(const Index& new_samples_number,
4704 const Index& new_inputs_number,
4705 const Index& new_targets_number)
4706{
4707
4708 data_file_name = "";
4709
4710 const Index new_variables_number = new_inputs_number + new_targets_number;
4711
4712 data.resize(new_samples_number, new_variables_number);
4713
4714 columns.resize(new_variables_number);
4715
4716 for(Index i = 0; i < new_variables_number; i++)
4717 {
4718 if(i < new_inputs_number)
4719 {
4720 columns(i).name = "column_" + to_string(i+1);
4721 columns(i).column_use = VariableUse::Input;
4722 columns(i).type = ColumnType::Numeric;
4723 }
4724 else
4725 {
4726 columns(i).name = "column_" + to_string(i+1);
4727 columns(i).column_use = VariableUse::Target;
4728 columns(i).type = ColumnType::Numeric;
4729 }
4730 }
4731
4732 input_variables_dimensions.resize(1);
4733
4734 samples_uses.resize(new_samples_number);
4735
4737}
4738
4739
4742
/// Sets the members of this object from another DataSet object.
/// Note: only the file name, header flag, separator, data matrix, columns and
/// display flag are copied here; other members keep their current values.
/// @param other_data_set Data set to copy from.

void DataSet::set(const DataSet& other_data_set)
{
    data_file_name = other_data_set.data_file_name;

    has_columns_names = other_data_set.has_columns_names;

    separator = other_data_set.separator;

    data = other_data_set.data;

    columns = other_data_set.columns;

    display = other_data_set.display;
}
4759
4760
4763
/// Sets the data set members by loading them from a XML document,
/// after restoring the default configuration.
/// @param data_set_document TinyXML document containing the member data.

void DataSet::set(const tinyxml2::XMLDocument& data_set_document)
{
    set_default();

    from_XML(data_set_document);
}
4770
4771
4774
/// Sets the data set members by loading them from a member file.
/// @param file_name Name of the data set member file.

void DataSet::set(const string& file_name)
{
    load(file_name);
}
4779
4784
/// Sets a new display flag.
/// @param new_display True to print messages from this class, false to stay quiet.

void DataSet::set_display(const bool& new_display)
{
    display = new_display;
}
4789
4790
4795
{
    // Restores the default configuration: rebuilds the thread pool from the
    // maximum number of OpenMP threads and resets the file-format and
    // time-series members.

    delete non_blocking_thread_pool;
    delete thread_pool_device;

    const int n = omp_get_max_threads();
    non_blocking_thread_pool = new NonBlockingThreadPool(n);
    thread_pool_device = new ThreadPoolDevice(non_blocking_thread_pool, n);

    // Default file format: comma separated, no header row, "NA" as missing label.
    has_columns_names = false;

    separator = Separator::Comma;

    missing_values_label = "NA";

    // No time series transformation by default.
    lags_number = 0;

    steps_ahead = 0;

    input_variables_dimensions.resize(1);

    input_variables_dimensions.setConstant(get_input_variables_number());
}
4823
4824
4830
4831void DataSet::set_data(const Tensor<type, 2>& new_data)
4832{
4833 const Index samples_number = new_data.dimension(0);
4834 const Index variables_number = new_data.dimension(1);
4835
4836 set(samples_number, variables_number);
4837
4838 data = new_data;
4839}
4840
/// Sets the time series data matrix member.
/// @param new_data New time series data matrix.

void DataSet::set_time_series_data(const Tensor<type, 2>& new_data)
{
    time_series_data = new_data;
}
4845
4846
/// Resizes the vector of time series columns to the given number of variables.
/// @param new_variables_number New number of time series columns.

void DataSet::set_time_series_columns_number(const Index& new_variables_number)
{
    time_series_columns.resize(new_variables_number);
}
4851
4852
4857
/// Sets the name of the data file.
/// @param new_data_file_name Path of the data file.

void DataSet::set_data_file_name(const string& new_data_file_name)
{
    data_file_name = new_data_file_name;
}
4862
4863
4865
/// Sets whether the data file has a header row with the column names.
/// @param new_has_columns_names True if the first row contains the names.

void DataSet::set_has_columns_names(const bool& new_has_columns_names)
{
    has_columns_names = new_has_columns_names;
}
4870
4871
4873
/// Sets whether the data file has a first column with row labels.
/// @param new_has_rows_label True if the first column contains labels.

void DataSet::set_has_rows_label(const bool& new_has_rows_label)
{
    has_rows_labels = new_has_rows_label;
}
4878
4879
4882
/// Sets a new separator from an enumeration value.
/// @param new_separator Separator value.

void DataSet::set_separator(const Separator& new_separator)
{
    separator = new_separator;
}
4887
4888
4891
4892void DataSet::set_separator(const char& new_separator)
4893{
4894 if(new_separator == ' ')
4895 {
4896 separator = Separator::Space;
4897 }
4898 else if(new_separator == '\t')
4899 {
4900 separator = Separator::Tab;
4901 }
4902 else if(new_separator == ',')
4903 {
4904 separator = Separator::Comma;
4905 }
4906 else if(new_separator == ';')
4907 {
4908 separator = Separator::Semicolon;
4909 }
4910 else
4911 {
4912 ostringstream buffer;
4913
4914 buffer << "OpenNN Exception: DataSet class.\n"
4915 << "void set_separator(const char&) method.\n"
4916 << "Unknown separator: " << new_separator << ".\n";
4917
4918 throw logic_error(buffer.str());
4919 }
4920}
4921
4922
4925
4926void DataSet::set_separator(const string& new_separator_string)
4927{
4928 if(new_separator_string == "Space")
4929 {
4930 separator = Separator::Space;
4931 }
4932 else if(new_separator_string == "Tab")
4933 {
4934 separator = Separator::Tab;
4935 }
4936 else if(new_separator_string == "Comma")
4937 {
4938 separator = Separator::Comma;
4939 }
4940 else if(new_separator_string == "Semicolon")
4941 {
4942 separator = Separator::Semicolon;
4943 }
4944 else
4945 {
4946 ostringstream buffer;
4947
4948 buffer << "OpenNN Exception: DataSet class.\n"
4949 << "void set_separator(const string&) method.\n"
4950 << "Unknown separator: " << new_separator_string << ".\n";
4951
4952 throw logic_error(buffer.str());
4953 }
4954}
4955
4956
4957
4960
/// Sets the label that marks missing values in the data file.
/// In debug builds, throws logic_error if the trimmed label is empty.
/// @param new_missing_values_label Label used in the file for missing values (e.g. "NA").

void DataSet::set_missing_values_label(const string& new_missing_values_label)
{
#ifdef OPENNN_DEBUG

    if(get_trimmed(new_missing_values_label).empty())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void set_missing_values_label(const string&) method.\n"
               << "Missing values label cannot be empty.\n";

        throw logic_error(buffer.str());
    }

#endif

    missing_values_label = new_missing_values_label;
}
4980
4981
4984
{
    // Store the policy used to deal with missing values (Unuse, Mean or Median).
    missing_values_method = new_missing_values_method;
}
4989
4990
4991void DataSet::set_missing_values_method(const string & new_missing_values_method)
4992{
4993 if(new_missing_values_method == "Unuse")
4994 {
4995 missing_values_method = MissingValuesMethod::Unuse;
4996 }
4997 else if(new_missing_values_method == "Mean")
4998 {
4999 missing_values_method = MissingValuesMethod::Mean;
5000 }
5001 else if(new_missing_values_method == "Median")
5002 {
5003 missing_values_method = MissingValuesMethod::Median;
5004 }
5005 else
5006 {
5007 ostringstream buffer;
5008
5009 buffer << "OpenNN Exception: DataSet class.\n"
5010 << "void set_missing_values_method(const string & method.\n"
5011 << "Not known method type.\n";
5012
5013 throw logic_error(buffer.str());
5014 }
5015}
5016
5017
5021
/// Sets the number of lags member used for the time series transformation.
/// @param new_lags_number New number of lags.

void DataSet::set_lags_number(const Index& new_lags_number)
{
    lags_number = new_lags_number;
}
5026
5027
5031
/// Sets the number of steps ahead member used for the time series transformation.
/// @param new_steps_ahead_number New number of steps ahead.

void DataSet::set_steps_ahead_number(const Index& new_steps_ahead_number)
{
    steps_ahead = new_steps_ahead_number;
}
5036
5037
5040
/// Sets the name of the column that holds the time values.
/// @param new_time_column Name of the time column.

void DataSet::set_time_column(const string& new_time_column)
{
    time_column = new_time_column;
}
5045
5046
5047void DataSet::set_threads_number(const int& new_threads_number)
5048{
5049 if(non_blocking_thread_pool != nullptr) delete non_blocking_thread_pool;
5050 if(thread_pool_device != nullptr) delete thread_pool_device;
5051
5052 non_blocking_thread_pool = new NonBlockingThreadPool(new_threads_number);
5053 thread_pool_device = new ThreadPoolDevice(non_blocking_thread_pool, new_threads_number);
5054}
5055
5056
5061
/// Sets a new number of samples, keeping the current number of variables.
/// NOTE(review): this delegates to set(samples, variables), which reallocates
/// the data matrix, so existing data values are not guaranteed to be preserved.
/// @param new_samples_number New number of samples.

void DataSet::set_samples_number(const Index& new_samples_number)
{
    const Index variables_number = get_variables_number();

    set(new_samples_number,variables_number);
}
5068
5069
5072
{
    // Marks every column of type Constant as unused and returns their names.

    const Index columns_number = get_columns_number();

#ifdef OPENNN_DEBUG

    if(columns_number == 0)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<string, 1> unuse_constant_columns() method.\n"
               << "Number of columns is zero.\n";

        throw logic_error(buffer.str());
    }

#endif

    // NOTE(review): used_samples_indices is never read in this body — confirm
    // it is dead code before removing.
    Tensor<Index, 1> used_samples_indices = get_used_samples_indices();

    Tensor<string, 1> constant_columns(0);

    for(Index i = 0; i < columns_number; i++)
    {
        if(columns(i).type == ColumnType::Constant)
        {
            columns(i).set_use(VariableUse::UnusedVariable);
            // push_back returns a grown copy of the tensor with the name appended.
            constant_columns = push_back(constant_columns, columns(i).name);
        }
    }

    return constant_columns;
}
5107
5108
5111
{
    // Marks every sample that duplicates an earlier sample as unused and
    // returns the indices of the samples that were marked.

    const Index samples_number = get_samples_number();

#ifdef OPENNN_DEBUG

    if(samples_number == 0)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<Index, 1> unuse_repeated_samples() method.\n"
               << "Number of samples is zero.\n";

        throw logic_error(buffer.str());
    }

#endif

    Tensor<Index, 1> repeated_samples;

    Tensor<type, 1> sample_i;
    Tensor<type, 1> sample_j;

    // NOTE(review): the loop body mutates shared state (set_sample_use and the
    // push_back on repeated_samples) from inside an OpenMP parallel for without
    // synchronization — potential data race; confirm before relying on OpenMP builds.
    #pragma omp parallel for private(sample_i, sample_j) schedule(dynamic)

    for(Index i = 0; i < static_cast<Index>(samples_number); i++)
    {
        sample_i = get_sample_data(i);

        // Compare sample i against every later sample j.
        for(Index j = static_cast<Index>(i+1); j < samples_number; j++)
        {
            sample_j = get_sample_data(j);

            if(get_sample_use(j) != SampleUse::UnusedSample
            && equal(sample_i.data(), sample_i.data()+sample_i.size(), sample_j.data()))
            {
                set_sample_use(j, SampleUse::UnusedSample);

                repeated_samples = push_back(repeated_samples, j);
            }
        }
    }

    return repeated_samples;
}
5158
5159
5162
5163Tensor<string, 1> DataSet::unuse_uncorrelated_columns(const type& minimum_correlation)
5164{
5165 Tensor<string, 1> unused_columns;
5166
5167 const Tensor<Correlation, 2> correlations = calculate_input_target_columns_correlations();
5168
5169 const Index input_columns_number = get_input_columns_number();
5170 const Index target_columns_number = get_target_columns_number();
5171
5172 const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
5173
5174 for(Index i = 0; i < input_columns_number; i++)
5175 {
5176 const Index input_column_index = input_columns_indices(i);
5177
5178 for(Index j = 0; j < target_columns_number; j++)
5179 {
5180 if(!isnan(correlations(i,j).r)
5181 && abs(correlations(i,j).r) < minimum_correlation
5182 && columns(input_column_index).column_use != VariableUse::UnusedVariable)
5183 {
5184 columns(input_column_index).set_use(VariableUse::UnusedVariable);
5185
5186 unused_columns = push_back(unused_columns, columns(input_column_index).name);
5187 }
5188 }
5189 }
5190
5191 return unused_columns;
5192}
5193
5194
5200
/// Returns a histogram for each used column, computed over the used samples.
/// Walks the columns while tracking two cursors: variable_index (position in the
/// data matrix, where a categorical column spans one variable per category) and
/// used_column_index (slot in the returned histogram vector).
/// @param bins_number Number of bins for the numeric histograms.

Tensor<Histogram, 1> DataSet::calculate_columns_distribution(const Index& bins_number) const
{
    const Index columns_number = columns.size();
    const Index used_columns_number = get_used_columns_number();
    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
    const Index used_samples_number = used_samples_indices.size();

    Tensor<Histogram, 1> histograms(used_columns_number);

    Index variable_index = 0;
    Index used_column_index = 0;

    for(Index i = 0; i < columns_number; i++)
    {
        if(columns(i).type == ColumnType::Numeric)
        {
            if(columns(i).column_use == VariableUse::UnusedVariable)
            {
                // Skip the variable but do not consume a histogram slot.
                variable_index++;
            }
            else
            {
                // Gather the used samples of this variable and bin them.
                Tensor<type, 1> column(used_samples_number);

                for(Index j = 0; j < used_samples_number; j++)
                {
                    column(j) = data(used_samples_indices(j), variable_index);
                }

                histograms(used_column_index) = histogram(column, bins_number);

                variable_index++;
                used_column_index++;
            }
        }
        else if(columns(i).type == ColumnType::Categorical)
        {
            const Index categories_number = columns(i).get_categories_number();

            if(columns(i).column_use == VariableUse::UnusedVariable)
            {
                // A categorical column occupies one one-hot variable per category.
                variable_index += categories_number;
            }
            else
            {
                // One frequency per category: count the used samples whose
                // one-hot entry for that category equals 1.
                Tensor<Index, 1> categories_frequencies(categories_number);
                categories_frequencies.setZero();
                Tensor<type, 1> centers(categories_number);

                for(Index j = 0; j < categories_number; j++)
                {
                    for(Index k = 0; k < used_samples_number; k++)
                    {
                        if(abs(data(used_samples_indices(k), variable_index) - type(1)) < type(NUMERIC_LIMITS_MIN))
                        {
                            categories_frequencies(j)++;
                        }
                    }

                    centers(j) = static_cast<type>(j);

                    variable_index++;
                }

                histograms(used_column_index).frequencies = categories_frequencies;
                histograms(used_column_index).centers = centers;

                used_column_index++;
            }
        }
        else if(columns(i).type == ColumnType::Binary)
        {
            if(columns(i).column_use == VariableUse::UnusedVariable)
            {
                variable_index++;
            }
            else
            {
                // Bin 0 counts values equal to 1, bin 1 counts everything else.
                // NOTE(review): no centers are assigned for binary histograms — verify
                // downstream consumers do not rely on them.
                Tensor<Index, 1> binary_frequencies(2);
                binary_frequencies.setZero();

                for(Index j = 0; j < used_samples_number; j++)
                {
                    if(abs(data(used_samples_indices(j), variable_index) - type(1)) < type(NUMERIC_LIMITS_MIN))
                    {
                        binary_frequencies(0)++;
                    }
                    else
                    {
                        binary_frequencies(1)++;
                    }
                }

                histograms(used_column_index).frequencies = binary_frequencies;
                variable_index++;
                used_column_index++;
            }
        }
        else // Time @todo
        {
            // NOTE(review): a used Time column neither fills a histogram slot nor
            // advances used_column_index, so subsequent used columns shift into
            // earlier slots — flagged @todo upstream.
            variable_index++;
        }
    }

    return histograms;
}
5307
5308
5319
{
    // Returns a box plot for each used numeric or binary column, computed over
    // the used samples. variable_index tracks the position in the data matrix
    // (categorical columns span one variable per category).

    Index used_columns_number = get_used_columns_number();

    Index columns_number = get_columns_number();

    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();

    // NOTE(review): box_plots is sized for all used columns, but used categorical
    // and time columns never fill a slot nor advance used_column_index, so their
    // slots stay default-constructed at the end of the vector — confirm callers
    // expect this.
    Tensor<BoxPlot, 1> box_plots(used_columns_number);

    Index used_column_index = 0;
    Index variable_index = 0;

    for(Index i = 0; i < columns_number; i++)
    {
        if(columns(i).type == ColumnType::Numeric || columns(i).type == ColumnType::Binary)
        {
            if(columns(i).column_use != VariableUse::UnusedVariable)
            {
                box_plots(used_column_index) = box_plot(data.chip(variable_index, 1), used_samples_indices);

                used_column_index++;
            }

            variable_index++;
        }
        else if(columns(i).type == ColumnType::Categorical)
        {
            variable_index += columns(i).get_categories_number();
        }
        else
        {
            variable_index++;
        }
    }

    return box_plots;
}
5358
5359
5362
5363Index DataSet::calculate_used_negatives(const Index& target_index) const
5364{
5365 Index negatives = 0;
5366
5367 const Tensor<Index, 1> used_indices = get_used_samples_indices();
5368
5369 const Index used_samples_number = used_indices.size();
5370
5371 for(Index i = 0; i < used_samples_number; i++)
5372 {
5373 const Index training_index = used_indices(i);
5374
5375 if(abs(data(training_index, target_index)) < type(NUMERIC_LIMITS_MIN))
5376 {
5377 negatives++;
5378 }
5379 else if(abs(data(training_index, target_index) - type(1)) > type(NUMERIC_LIMITS_MIN))
5380 {
5381 ostringstream buffer;
5382
5383 buffer << "OpenNN Exception: DataSet class.\n"
5384 << "Index calculate_used_negatives(const Index&) const method.\n"
5385 << "Training sample is neither a positive nor a negative: " << data(training_index, target_index) << endl;
5386
5387 throw logic_error(buffer.str());
5388 }
5389 }
5390
5391 return negatives;
5392}
5393
5394
5397
5398Index DataSet::calculate_training_negatives(const Index& target_index) const
5399{
5400 Index negatives = 0;
5401
5402 const Tensor<Index, 1> training_indices = get_training_samples_indices();
5403
5404 const Index training_samples_number = training_indices.size();
5405
5406 for(Index i = 0; i < training_samples_number; i++)
5407 {
5408 const Index training_index = training_indices(i);
5409
5410 if(abs(data(training_index, target_index)) < type(NUMERIC_LIMITS_MIN))
5411 {
5412 negatives++;
5413 }
5414 else if(abs(data(training_index, target_index) - static_cast<type>(1)) > static_cast<type>(1.0e-3))
5415 {
5416 ostringstream buffer;
5417
5418 buffer << "OpenNN Exception: DataSet class.\n"
5419 << "Index calculate_training_negatives(const Index&) const method.\n"
5420 << "Training sample is neither a positive nor a negative: " << data(training_index, target_index) << endl;
5421
5422 throw logic_error(buffer.str());
5423 }
5424 }
5425
5426 return negatives;
5427}
5428
5429
5432
5433Index DataSet::calculate_selection_negatives(const Index& target_index) const
5434{
5435 Index negatives = 0;
5436
5437 const Index selection_samples_number = get_selection_samples_number();
5438
5439 const Tensor<Index, 1> selection_indices = get_selection_samples_indices();
5440
5441 for(Index i = 0; i < static_cast<Index>(selection_samples_number); i++)
5442 {
5443 const Index selection_index = selection_indices(i);
5444
5445 if(abs(data(selection_index, target_index)) < type(NUMERIC_LIMITS_MIN))
5446 {
5447 negatives++;
5448 }
5449 else if(abs(data(selection_index, target_index) - type(1)) > type(NUMERIC_LIMITS_MIN))
5450 {
5451 ostringstream buffer;
5452
5453 buffer << "OpenNN Exception: DataSet class.\n"
5454 << "Index calculate_testing_negatives(const Index&) const method.\n"
5455 << "Selection sample is neither a positive nor a negative: " << data(selection_index, target_index) << endl;
5456
5457 throw logic_error(buffer.str());
5458 }
5459 }
5460
5461 return negatives;
5462}
5463
5464
5467
5468Index DataSet::calculate_testing_negatives(const Index& target_index) const
5469{
5470 Index negatives = 0;
5471
5472 const Index testing_samples_number = get_testing_samples_number();
5473
5474 const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
5475
5476 for(Index i = 0; i < static_cast<Index>(testing_samples_number); i++)
5477 {
5478 const Index testing_index = testing_indices(i);
5479
5480 if(data(testing_index, target_index) < type(NUMERIC_LIMITS_MIN))
5481 {
5482 negatives++;
5483 }
5484 }
5485
5486 return negatives;
5487}
5488
5489
5498
/// Returns the descriptives of every variable, delegating to the free
/// descriptives() function on the whole data matrix.

Tensor<Descriptives, 1> DataSet::calculate_variables_descriptives() const
{
    return descriptives(data);
}
5503
5504
5513
{
    // Descriptives of the used variables, restricted to the used samples.
    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
    const Tensor<Index, 1> used_variables_indices = get_used_variables_indices();

    return descriptives(data, used_samples_indices, used_variables_indices);
}
5521
5522
5524
{
    // Descriptives of the input variables over the used samples whose (single,
    // binary) target equals 1. Two passes: count the positives, then gather
    // their sample indices.

#ifdef OPENNN_DEBUG

    const Index targets_number = get_target_variables_number();

    if(targets_number != 1)
    {
        ostringstream buffer;

        // NOTE(review): "muste" typo in the message below.
        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<type, 2> calculate_columns_descriptives_positive_samples() const method.\n"
               << "Number of targets muste be 1.\n";

        throw logic_error(buffer.str());
    }
#endif

    const Index target_index = get_target_variables_indices()(0);

    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
    const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();

    const Index samples_number = used_samples_indices.size();

    // Count used positive samples

    Index positive_samples_number = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        Index sample_index = used_samples_indices(i);

        if(abs(data(sample_index, target_index) - type(1)) < type(NUMERIC_LIMITS_MIN)) positive_samples_number++;
    }

    // Get used positive samples indices

    Tensor<Index, 1> positive_used_samples_indices(positive_samples_number);
    Index positive_sample_index = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        Index sample_index = used_samples_indices(i);

        if(abs(data(sample_index, target_index) - type(1)) < type(NUMERIC_LIMITS_MIN))
        {
            positive_used_samples_indices(positive_sample_index) = sample_index;
            positive_sample_index++;
        }
    }

    return descriptives(data, positive_used_samples_indices, input_variables_indices);
}
5580
5581
5583
{
    // Descriptives of the input variables over the used samples whose (single,
    // binary) target equals 0. Two passes: count the negatives, then gather
    // their sample indices.

#ifdef OPENNN_DEBUG

    const Index targets_number = get_target_variables_number();

    if(targets_number != 1)
    {
        ostringstream buffer;

        // NOTE(review): this message names the positive-samples method — copy-paste
        // leftover; also note the "muste" typo.
        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<type, 2> calculate_columns_descriptives_positive_samples() const method.\n"
               << "Number of targets muste be 1.\n";

        throw logic_error(buffer.str());
    }
#endif

    const Index target_index = get_target_variables_indices()(0);

    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
    const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();

    const Index samples_number = used_samples_indices.size();

    // Count used negative samples
    // NOTE(review): the comparison below has no abs(), unlike the positive case —
    // any value below the epsilon (including negatives) counts as a negative sample.

    Index negative_samples_number = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        Index sample_index = used_samples_indices(i);

        if(data(sample_index, target_index) < type(NUMERIC_LIMITS_MIN)) negative_samples_number++;
    }

    // Get used negative samples indices

    Tensor<Index, 1> negative_used_samples_indices(negative_samples_number);
    Index negative_sample_index = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        Index sample_index = used_samples_indices(i);

        if(data(sample_index, target_index) < type(NUMERIC_LIMITS_MIN))
        {
            negative_used_samples_indices(negative_sample_index) = sample_index;
            negative_sample_index++;
        }

    }

    return descriptives(data, negative_used_samples_indices, input_variables_indices);
}
5640
5641
5644
5645Tensor<Descriptives, 1> DataSet::calculate_columns_descriptives_categories(const Index& class_index) const
5646{
5647 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
5648 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
5649
5650 const Index samples_number = used_samples_indices.size();
5651
5652 // Count used class samples
5653
5654 Index class_samples_number = 0;
5655
5656 for(Index i = 0; i < samples_number; i++)
5657 {
5658 Index sample_index = used_samples_indices(i);
5659
5660 if(abs(data(sample_index, class_index) - type(1)) < type(NUMERIC_LIMITS_MIN)) class_samples_number++;
5661 }
5662
5663 // Get used class samples indices
5664
5665 Tensor<Index, 1> class_used_samples_indices(class_samples_number);
5666 class_used_samples_indices.setZero();
5667 Index class_sample_index = 0;
5668
5669 for(Index i = 0; i < samples_number; i++)
5670 {
5671 Index sample_index = used_samples_indices(i);
5672
5673 if(abs(data(sample_index, class_index) - type(1)) < type(NUMERIC_LIMITS_MIN))
5674 {
5675 class_used_samples_indices(class_sample_index) = sample_index;
5676 class_sample_index++;
5677 }
5678 }
5679
5680 return descriptives(data, class_used_samples_indices, input_variables_indices);
5681}
5682
5683
5692
{
    // Descriptives of the used columns, restricted to the training samples.
    const Tensor<Index, 1> training_indices = get_training_samples_indices();

    const Tensor<Index, 1> used_indices = get_used_columns_indices();

    return descriptives(data, training_indices, used_indices);
}
5701
5702
5711
{
    // Descriptives of the used columns, restricted to the selection samples.
    const Tensor<Index, 1> selection_indices = get_selection_samples_indices();

    const Tensor<Index, 1> used_indices = get_used_columns_indices();

    return descriptives(data, selection_indices, used_indices);
}
5720
5721
5725
{
    // Descriptives of the input variables over the used samples.
    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();

    const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();

    return descriptives(data, used_samples_indices, input_variables_indices);
}
5734
5735
5744
{
    // Descriptives of the target variables over the used samples.
    const Tensor<Index, 1> used_indices = get_used_samples_indices();

    const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();

    return descriptives(data, used_indices, target_variables_indices);
}
5753
5754
5755Tensor<Descriptives, 1> DataSet::calculate_testing_target_variables_descriptives() const
5756{
5757 const Tensor<Index, 1> testing_indices = get_testing_samples_indices();
5758
5759 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
5760
5761 return descriptives(data, testing_indices, target_variables_indices);
5762}
5763
5764
5766
{
    // Minimum of each input variable over the used samples.
    return columns_minimums(data, get_used_samples_indices(), get_input_variables_indices());
}
5771
5772
5774
{
    // Minimum of each target variable over the used samples.
    return columns_minimums(data, get_used_samples_indices(), get_target_variables_indices());
}
5779
5780
5781
5783
{
    // Maximum of each input variable over the used samples.
    return columns_maximums(data, get_used_samples_indices(), get_input_variables_indices());
}
5788
5789
5791
{
    // Maximum of each target variable over the used samples.
    return columns_maximums(data, get_used_samples_indices(), get_target_variables_indices());
}
5796
5797
5799
{
    // Minimum of each used variable over the used samples.
    return columns_minimums(data, get_used_samples_indices(), get_used_variables_indices());
}
5804
5805
5808
5809Tensor<type, 1> DataSet::calculate_variables_means(const Tensor<Index, 1>& variables_indices) const
5810{
5811 const Index variables_number = variables_indices.size();
5812
5813 Tensor<type, 1> means(variables_number);
5814 means.setZero();
5815
5816 #pragma omp parallel for
5817
5818 for(Index i = 0; i < variables_number; i++)
5819 {
5820 const Index variable_index = variables_indices(i);
5821
5822 Tensor<type, 0> mean = data.chip(variable_index, 1).mean();
5823
5824 means(i) = mean(0);
5825 }
5826
5827 return means;
5828}
5829
5830
5831Tensor<type, 1> DataSet::calculate_used_targets_mean() const
5832{
5833 const Tensor<Index, 1> used_indices = get_used_samples_indices();
5834
5835 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
5836
5837 return mean(data, used_indices, target_variables_indices);
5838}
5839
5840
5842
{
    // Mean of each target variable over the selection samples.
    const Tensor<Index, 1> selection_indices = get_selection_samples_indices();

    const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();

    return mean(data, selection_indices, target_variables_indices);
}
5851
5852
5855
/// Returns the Greenwich Mean Time offset member.

Index DataSet::get_gmt() const
{
    return gmt;
}
5860
5861
5864
/// Sets the Greenwich Mean Time offset member.
/// NOTE(review): the parameter is a non-const reference, unlike the other
/// setters in this class which take const references — confirm whether this is
/// intentional before changing the declaration in the header.
/// @param new_gmt New GMT offset.

void DataSet::set_gmt(Index& new_gmt)
{
    gmt = new_gmt;
}
5869
5870
5875
{
    // Computes, for every (input column, target column) pair, the correlation
    // between their column data over the used samples.

    const Index input_columns_number = get_input_columns_number();
    const Index target_columns_number = get_target_columns_number();

    const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();
    const Tensor<Index, 1> target_columns_indices = get_target_columns_indices();

    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();

    Tensor<Correlation, 2> correlations(input_columns_number, target_columns_number);

    for(Index i = 0; i < input_columns_number; i++)
    {
        const Index input_index = input_columns_indices(i);

        const Tensor<type, 2> input_column_data = get_column_data(input_index, used_samples_indices);

        for(Index j = 0; j < target_columns_number; j++)
        {
            const Index target_index = target_columns_indices(j);

            const Tensor<type, 2> target_column_data = get_column_data(target_index, used_samples_indices);

            correlations(i,j) = OpenNN::correlation(thread_pool_device, input_column_data, target_column_data);

            // NOTE(review): this logging is unconditional — it is not gated on the
            // display member like other messages; confirm before silencing.
            cout << columns(input_index).name << " - " << columns(target_index).name << " correlation: " << correlations(i,j).r << endl;
        }
    }

    return correlations;
}
5908
5909
5911
{
    // Returns true if any element of the data matrix is NaN.
    for(Index i = 0; i < data.size(); i++) if(isnan(data(i))) return true;

    return false;
}
5918
5919
5921
5922bool DataSet::has_nan_row(const Index& row_index) const
5923{
5924 for(Index j = 0; j < data.dimension(1); j++)
5925 {
5926 if(isnan(data(row_index,j))) return true;
5927 }
5928
5929 return false;
5930}
5931
5932
5939
{
    // Prints the number and percentage of missing values, of columns containing
    // missing values and of samples containing missing values.
    // NOTE(review): the percentages use integer division and divide by data
    // dimensions — truncates to whole percents and would divide by zero on an
    // empty data set; confirm acceptable for a print helper.

    const Index missing_values_number = count_nan();

    cout << "Missing values number: " << missing_values_number << " (" << missing_values_number*100/data.size() << "%)" << endl;

    const Tensor<Index, 0> columns_with_missing_values = count_nan_columns().sum();

    cout << "Columns with missing values: " << columns_with_missing_values(0)
         << " (" << columns_with_missing_values(0)*100/data.dimension(1) << "%)" << endl;

    const Index samples_with_missing_values = count_rows_with_nan();

    cout << "Samples with missing values: "
         << samples_with_missing_values << " (" << samples_with_missing_values*100/data.dimension(0) << "%)" << endl;
}
5956
5957
5959
{
    // Prints the correlation of every (target, input) variable pair.
    // NOTE(review): the correlations matrix is computed per column while the
    // loops run over variable counts — for categorical columns these differ;
    // confirm the indexing matches before relying on the output.

    const Index inputs_number = get_input_variables_number();
    const Index targets_number = get_target_variables_number();

    const Tensor<string, 1> inputs_names = get_input_variables_names();
    const Tensor<string, 1> targets_name = get_target_variables_names();

    const Tensor<Correlation, 2> correlations = calculate_input_target_columns_correlations();

    for(Index j = 0; j < targets_number; j++)
    {
        for(Index i = 0; i < inputs_number; i++)
        {
            cout << targets_name(j) << " - " << inputs_names(i) << ": " << correlations(i,j).r << endl;
        }
    }
}
5978
5979
5982
{
    // Prints every input-target correlation ordered by correlation value
    // (std::map iterates its keys in ascending order).

    const Index inputs_number = get_input_columns_number();
    const Index targets_number = get_target_columns_number();

    // NOTE(review): counts above are per column but the names below are per variable;
    // they can disagree when categorical columns are present — confirm.
    const Tensor<string, 1> inputs_names = get_input_variables_names();
    const Tensor<string, 1> targets_name = get_target_variables_names();

    const Tensor<type, 2> correlations = get_correlation_values(calculate_input_target_columns_correlations());

    // NOTE(review): the two tensors below are never written or read afterwards — dead locals.
    Tensor<type, 1> target_correlations(inputs_number);

    Tensor<string, 2> top_correlations(inputs_number, 2);

    // Map keyed by the correlation value; pairs with identical values are silently dropped
    // because map keys are unique.
    map<type,string> top_correlation;

    for(Index i = 0 ; i < inputs_number; i++)
    {
        for(Index j = 0 ; j < targets_number ; j++)
        {
            top_correlation.insert(pair<type,string>(correlations(i,j), inputs_names(i) + " - " + targets_name(j)));
        }
    }

    map<type,string>::iterator it;

    for(it = top_correlation.begin(); it != top_correlation.end(); it++)
    {
        cout << "Correlation: " << (*it).first << " between " << (*it).second << "" << endl;
    }
}
6014
6015
6018
{
    // Computes the symmetric matrix of correlations between all pairs of input columns.
    // Only the upper triangle (j >= i) is actually computed; the lower triangle is mirrored.

    const Tensor<Index, 1> input_columns_indices = get_input_columns_indices();

    const Index input_columns_number = get_input_columns_number();

    Tensor<Correlation, 2> correlations(input_columns_number, input_columns_number);

    for(Index i = 0; i < input_columns_number; i++)
    {
        const Index current_input_index_i = input_columns_indices(i);

        const Tensor<type, 2> input_i = get_column_data(current_input_index_i);

        cout << "Calculating " << columns(current_input_index_i).name << " correlations. " << endl;

        // Upper triangle only, including the diagonal (self-correlation).
        for(Index j = i; j < input_columns_number; j++)
        {
            const Index current_input_index_j = input_columns_indices(j);

            const Tensor<type, 2> input_j = get_column_data(current_input_index_j);

            correlations(i,j) = OpenNN::correlation(thread_pool_device, input_i, input_j);

            // Clamp values that exceed 1 (e.g. from floating-point rounding) back to 1.
            if(correlations(i,j).r > type(1))
            {
                correlations(i,j).r = type(1);
            }
        }
    }

    // Mirror the upper triangle into the lower triangle to make the matrix symmetric.
    for(Index i = 0; i < input_columns_number; i++)
    {
        for(Index j = 0; j < i; j++)
        {
            correlations(i,j) = correlations(j,i);
        }
    }

    return correlations;
}
6060
6061
6063
{
    // Prints the full matrix of correlation values between all input columns to standard output.

    const Tensor<type, 2> inputs_correlations = get_correlation_values(calculate_input_columns_correlations());

    cout << inputs_correlations << endl;
}
6070
6071
6072void DataSet::print_data_file_preview() const
6073{
6074 const Index size = data_file_preview.size();
6075
6076 for(Index i = 0; i < size; i++)
6077 {
6078 for(Index j = 0; j < data_file_preview(i).size(); j++)
6079 {
6080 cout << data_file_preview(i)(j) << " ";
6081 }
6082
6083 cout << endl;
6084 }
6085}
6086
6087
6090
{
    // Prints all distinct input-input correlation pairs ordered by correlation value
    // (std::map iterates keys in ascending order).

    const Index variables_number = get_input_variables_number();

    const Tensor<string, 1> variables_name = get_input_variables_names();

    const Tensor<type, 2> variables_correlations = get_correlation_values(calculate_input_columns_correlations());

    // Number of unordered pairs excluding the diagonal.
    const Index correlations_number = variables_number*(variables_number-1)/2;

    // NOTE(review): this tensor is never filled or used afterwards — dead local.
    Tensor<string, 2> top_correlations(correlations_number, 3);

    // Map keyed by correlation value; pairs with identical values are dropped (unique keys).
    map<type, string> top_correlation;

    for(Index i = 0; i < variables_number; i++)
    {
        // j starts at i and the diagonal is skipped, so only the strict upper triangle is visited.
        for(Index j = i; j < variables_number; j++)
        {
            if(i == j) continue;

            top_correlation.insert(pair<type,string>(variables_correlations(i,j), variables_name(i) + " - " + variables_name(j)));
        }
    }

    map<type,string> ::iterator it;

    for(it = top_correlation.begin(); it != top_correlation.end(); it++)
    {
        cout << "Correlation: " << (*it).first << " between " << (*it).second << "" << endl;
    }
}
6122
6123
6127
{
    // Assigns a default scaler to every column:
    // numeric columns get mean/standard-deviation scaling,
    // every other column type gets minimum-maximum scaling.

    const Index columns_number = columns.size();

    for(Index i = 0; i < columns_number; i++)
    {
        if(columns(i).type == ColumnType::Numeric)
        {
            columns(i).scaler = Scaler::MeanStandardDeviation;
        }
        else
        {
            columns(i).scaler = Scaler::MinimumMaximum;
        }
    }
}
6144
6145
6146Tensor<Descriptives, 1> DataSet::scale_data()
6147{
6148 const Index variables_number = get_variables_number();
6149
6150 const Tensor<Descriptives, 1> variables_descriptives = calculate_variables_descriptives();
6151
6152 Index column_index;
6153
6154 for(Index i = 0; i < variables_number; i++)
6155 {
6156 column_index = get_column_index(i);
6157
6158 switch(columns(column_index).scaler)
6159 {
6160 case Scaler::NoScaling:
6161 // Do nothing
6162 break;
6163
6164 case Scaler::MinimumMaximum:
6165 scale_minimum_maximum(data, i, variables_descriptives(i));
6166 break;
6167
6168 case Scaler::MeanStandardDeviation:
6169 scale_mean_standard_deviation(data, i, variables_descriptives(i));
6170 break;
6171
6172 case Scaler::StandardDeviation:
6173 scale_standard_deviation(data, i, variables_descriptives(i));
6174 break;
6175
6176 case Scaler::Logarithm:
6177 scale_logarithmic(data, i);
6178 break;
6179
6180 default:
6181 {
6182 ostringstream buffer;
6183
6184 buffer << "OpenNN Exception: DataSet class\n"
6185 << "void scale_data() method.\n"
6186 << "Unknown scaler: " << int(columns(i).scaler) << "\n";
6187
6188 throw logic_error(buffer.str());
6189 }
6190 }
6191 }
6192
6193 return variables_descriptives;
6194}
6195
6196
6197void DataSet::unscale_data(const Tensor<Descriptives, 1>& variables_descriptives)
6198{
6199 const Index variables_number = get_variables_number();
6200
6201 for(Index i = 0; i < variables_number; i++)
6202 {
6203 switch(columns(i).scaler)
6204 {
6205 case Scaler::NoScaling:
6206 // Do nothing
6207 break;
6208
6209 case Scaler::MinimumMaximum:
6210 unscale_minimum_maximum(data, i, variables_descriptives(i));
6211 break;
6212
6213 case Scaler::MeanStandardDeviation:
6214 unscale_mean_standard_deviation(data, i, variables_descriptives(i));
6215 break;
6216
6217 case Scaler::StandardDeviation:
6218 unscale_standard_deviation(data, i, variables_descriptives(i));
6219 break;
6220
6221 case Scaler::Logarithm:
6222 unscale_logarithmic(data, i);
6223 break;
6224
6225 default:
6226 {
6227 ostringstream buffer;
6228
6229 buffer << "OpenNN Exception: DataSet class\n"
6230 << "void unscale_data() method.\n"
6231 << "Unknown scaler: " << int(columns(i).scaler) << "\n";
6232
6233 throw logic_error(buffer.str());
6234 }
6235 }
6236 }
6237}
6238
6239
6242
6243Tensor<Descriptives, 1> DataSet::scale_input_variables()
6244{
6245 const Index input_variables_number = get_input_variables_number();
6246
6247 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6248 const Tensor<Scaler, 1> input_variables_scalers = get_input_variables_scalers();
6249
6250 const Tensor<Descriptives, 1> input_variables_descriptives = calculate_input_variables_descriptives();
6251
6252 for(Index i = 0; i < input_variables_number; i++)
6253 {
6254 switch(input_variables_scalers(i))
6255 {
6256 case Scaler::NoScaling:
6257 // Do nothing
6258 break;
6259
6260 case Scaler::MinimumMaximum:
6261 scale_minimum_maximum(data, input_variables_indices(i), input_variables_descriptives(i));
6262 break;
6263
6264 case Scaler::MeanStandardDeviation:
6265 scale_mean_standard_deviation(data, input_variables_indices(i), input_variables_descriptives(i));
6266 break;
6267
6268 case Scaler::StandardDeviation:
6269 scale_standard_deviation(data, input_variables_indices(i), input_variables_descriptives(i));
6270 break;
6271
6272 case Scaler::Logarithm:
6273 scale_logarithmic(data, input_variables_indices(i));
6274 break;
6275
6276 default:
6277 {
6278 ostringstream buffer;
6279
6280 buffer << "OpenNN Exception: DataSet class\n"
6281 << "void scale_input_variables(const Tensor<string, 1>&, const Tensor<Descriptives, 1>&) method.\n"
6282 << "Unknown scaling and unscaling method: " << int(input_variables_scalers(i)) << "\n";
6283
6284 throw logic_error(buffer.str());
6285 }
6286 }
6287 }
6288
6289 return input_variables_descriptives;
6290}
6291
6292
6297
6298Tensor<Descriptives, 1> DataSet::scale_target_variables()
6299{
6300 const Index target_variables_number = get_target_variables_number();
6301
6302 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
6303 const Tensor<Scaler, 1> target_variables_scalers = get_target_variables_scalers();
6304
6305 const Tensor<Descriptives, 1> target_variables_descriptives = calculate_target_variables_descriptives();
6306
6307 for(Index i = 0; i < target_variables_number; i++)
6308 {
6309 switch(target_variables_scalers(i))
6310 {
6311 case Scaler::NoScaling:
6312 // Do nothing
6313 break;
6314
6315 case Scaler::MinimumMaximum:
6316 scale_minimum_maximum(data, target_variables_indices(i), target_variables_descriptives(i));
6317 break;
6318
6319 case Scaler::MeanStandardDeviation:
6320 scale_mean_standard_deviation(data, target_variables_indices(i), target_variables_descriptives(i));
6321 break;
6322
6323 case Scaler::StandardDeviation:
6324 scale_standard_deviation(data, target_variables_indices(i), target_variables_descriptives(i));
6325 break;
6326
6327 case Scaler::Logarithm:
6328 scale_logarithmic(data, target_variables_indices(i));
6329 break;
6330
6331 default:
6332 {
6333 ostringstream buffer;
6334
6335 buffer << "OpenNN Exception: DataSet class\n"
6336 << "void scale_input_variables(const Tensor<string, 1>&, const Tensor<Descriptives, 1>&) method.\n"
6337 << "Unknown scaling and unscaling method: " << int(target_variables_scalers(i)) << "\n";
6338
6339 throw logic_error(buffer.str());
6340 }
6341 }
6342 }
6343
6344 return target_variables_descriptives;
6345}
6346
6347
6350
6351void DataSet::unscale_input_variables(const Tensor<Descriptives, 1>& input_variables_descriptives)
6352{
6353 const Index input_variables_number = get_input_variables_number();
6354
6355 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
6356
6357 const Tensor<Scaler, 1> input_variables_scalers = get_input_variables_scalers();
6358
6359 for(Index i = 0; i < input_variables_number; i++)
6360 {
6361 switch(input_variables_scalers(i))
6362 {
6363 case Scaler::NoScaling:
6364 // Do nothing
6365 break;
6366
6367 case Scaler::MinimumMaximum:
6368 unscale_minimum_maximum(data, input_variables_indices(i), input_variables_descriptives(i));
6369 break;
6370
6371 case Scaler::MeanStandardDeviation:
6372 unscale_mean_standard_deviation(data, input_variables_indices(i), input_variables_descriptives(i));
6373 break;
6374
6375 case Scaler::StandardDeviation:
6376 unscale_standard_deviation(data, input_variables_indices(i), input_variables_descriptives(i));
6377 break;
6378
6379 default:
6380 {
6381 ostringstream buffer;
6382
6383 buffer << "OpenNN Exception: DataSet class\n"
6384 << "void unscale_input_variables(const Tensor<string, 1>&, const Tensor<Descriptives, 1>&) method.\n"
6385 << "Unknown unscaling and unscaling method: " << int(input_variables_scalers(i)) << "\n";
6386
6387 throw logic_error(buffer.str());
6388 }
6389 }
6390 }
6391}
6392
6393
6396
6397void DataSet::unscale_target_variables(const Tensor<Descriptives, 1>& targets_descriptives)
6398{
6399 const Index target_variables_number = get_target_variables_number();
6400 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
6401 const Tensor<Scaler, 1> target_variables_scalers = get_target_variables_scalers();
6402
6403 for(Index i = 0; i < target_variables_number; i++)
6404 {
6405 switch(target_variables_scalers(i))
6406 {
6407 case Scaler::NoScaling:
6408 break;
6409
6410 case Scaler::MinimumMaximum:
6411 unscale_minimum_maximum(data, target_variables_indices(i), targets_descriptives(i));
6412 break;
6413
6414 case Scaler::MeanStandardDeviation:
6415 unscale_mean_standard_deviation(data, target_variables_indices(i), targets_descriptives(i));
6416 break;
6417
6418 case Scaler::Logarithm:
6419 unscale_logarithmic(data, target_variables_indices(i));
6420 break;
6421
6422 default:
6423 {
6424 ostringstream buffer;
6425
6426 buffer << "OpenNN Exception: DataSet class\n"
6427 << "void unscale_targets(const Tensor<Descriptives, 1>&) method.\n"
6428 << "Unknown unscaling and unscaling method.\n";
6429
6430 throw logic_error(buffer.str());
6431 }
6432 }
6433 }
6434}
6435
6436
6439
/// Sets every entry of the data matrix to the given value.
/// @param new_value Value written to all elements of the data tensor.

void DataSet::set_data_constant(const type& new_value)
{
    data.setConstant(new_value);
}
6444
6445
6448
{
    // Fills the entire data tensor with random values (Eigen's setRandom).
    data.setRandom();
}
6453
6454
6457
{
    // Randomizes the whole data matrix, then overwrites the target part with
    // classification-style values: a random 0/1 label when there is a single target,
    // or a random one-hot encoding when there are several targets.

    data.setRandom();

    const Index samples_number = data.dimension(0);
    const Index variables_number = data.dimension(1);

    const Index input_variables_number = get_input_variables_number();
    const Index target_variables_number = variables_number - input_variables_number;

    Index target_variable_index = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        // Single target: pick the label 0 or 1.
        // Multiple targets: pick which target column receives the 1 (one-hot).
        if(target_variables_number == 1) target_variable_index = rand()%2;
        else target_variable_index = rand()%(variables_number-input_variables_number)+input_variables_number;

        for(Index j = input_variables_number; j < variables_number; j++)
        {
            if(target_variables_number == 1) data(i,j) = static_cast<type>(target_variable_index);
            else data(i,j) = (j == target_variable_index) ? type(1) : type(0);
        }
    }
}
6482
6483
6485
{
    // Serializes this DataSet to XML: data-file settings, columns, time series columns,
    // rows labels, sample uses, missing-values summary and the data file preview.

    ostringstream buffer;

    file_stream.OpenElement("DataSet");

    // Data file

    file_stream.OpenElement("DataFile");

    // File type ?
    {
        file_stream.OpenElement("FileType");

        file_stream.PushText("csv");

        file_stream.CloseElement();
    }

    // Data file name
    {
        file_stream.OpenElement("DataFileName");

        file_stream.PushText(data_file_name.c_str());

        file_stream.CloseElement();
    }

    // Separator
    {
        file_stream.OpenElement("Separator");

        file_stream.PushText(get_separator_string().c_str());

        file_stream.CloseElement();
    }

    // Columns names
    {
        file_stream.OpenElement("ColumnsNames");

        // Serialized as "1"/"0"; from_XML reads it back with a == "1" comparison.
        buffer.str("");
        buffer << has_columns_names;

        file_stream.PushText(buffer.str().c_str());

        file_stream.CloseElement();
    }

    // Rows labels
    {
        file_stream.OpenElement("RowsLabels");

        buffer.str("");

        buffer << has_rows_labels;

        file_stream.PushText(buffer.str().c_str());

        file_stream.CloseElement();
    }

    // Missing values label
    {
        file_stream.OpenElement("MissingValuesLabel");

        file_stream.PushText(missing_values_label.c_str());

        file_stream.CloseElement();
    }

    // Lags number
    {
        file_stream.OpenElement("LagsNumber");

        buffer.str("");
        buffer << get_lags_number();

        file_stream.PushText(buffer.str().c_str());

        file_stream.CloseElement();
    }

    // Steps Ahead
    {
        file_stream.OpenElement("StepsAhead");

        buffer.str("");
        buffer << get_steps_ahead();

        file_stream.PushText(buffer.str().c_str());

        file_stream.CloseElement();
    }

    // Time Column
    {
        file_stream.OpenElement("TimeColumn");

        buffer.str("");
        buffer << get_time_column();

        file_stream.PushText(buffer.str().c_str());

        file_stream.CloseElement();
    }
    // Close DataFile

    file_stream.CloseElement();

    // Columns

    file_stream.OpenElement("Columns");

    // Columns number
    {
        file_stream.OpenElement("ColumnsNumber");

        buffer.str("");
        buffer << get_columns_number();

        file_stream.PushText(buffer.str().c_str());

        file_stream.CloseElement();
    }

    // Columns items

    {
        const Index columns_number = get_columns_number();

        // Each column serializes its own fields; the 1-based "Item" attribute is
        // checked on read to detect out-of-order elements.
        for(Index i = 0; i < columns_number; i++)
        {
            file_stream.OpenElement("Column");

            file_stream.PushAttribute("Item", to_string(i+1).c_str());

            columns(i).write_XML(file_stream);

            file_stream.CloseElement();
        }
    }

    // Close columns

    file_stream.CloseElement();

    // Time series columns

    const Index time_series_columns_number = get_time_series_columns_number();

    // The whole section is omitted when the data set is not a time series.
    if(time_series_columns_number != 0)
    {
        file_stream.OpenElement("TimeSeriesColumns");

        // Time series columns number
        {
            file_stream.OpenElement("TimeSeriesColumnsNumber");

            buffer.str("");

            // NOTE(review): nothing is streamed into buffer here, so the element text is
            // pushed empty — presumably "buffer << time_series_columns_number;" is
            // intended; confirm against repository history.
            file_stream.PushText(buffer.str().c_str());

            file_stream.CloseElement();
        }

        // Time series columns items

        for(Index i = 0; i < time_series_columns_number; i++)
        {
            file_stream.OpenElement("TimeSeriesColumn");

            file_stream.PushAttribute("Item", to_string(i+1).c_str());

            time_series_columns(i).write_XML(file_stream);

            file_stream.CloseElement();
        }

        // Close time series columns

        file_stream.CloseElement();
    }

    // Rows labels

    if(has_rows_labels)
    {
        const Index rows_labels_number = rows_labels.dimension(0);

        file_stream.OpenElement("RowsLabels");

        buffer.str("");

        // Labels are joined with commas, no trailing separator.
        for(Index i = 0; i < rows_labels_number; i++)
        {
            buffer << rows_labels(i);

            if(i != rows_labels_number-1) buffer << ",";
        }

        file_stream.PushText(buffer.str().c_str());

        file_stream.CloseElement();
    }

    // Samples

    file_stream.OpenElement("Samples");

    // Samples number
    {
        file_stream.OpenElement("SamplesNumber");

        buffer.str("");
        buffer << get_samples_number();

        file_stream.PushText(buffer.str().c_str());

        file_stream.CloseElement();
    }

    // Samples uses

    {
        file_stream.OpenElement("SamplesUses");

        buffer.str("");

        const Index samples_number = get_samples_number();

        // Each sample's use is stored as its enum's integer value, space-separated.
        for(Index i = 0; i < samples_number; i++)
        {
            SampleUse sample_use = samples_uses(i);

            buffer << Index(sample_use);

            if(i < (samples_number-1)) buffer << " ";
        }

        file_stream.PushText(buffer.str().c_str());

        file_stream.CloseElement();
    }

    // Close samples

    file_stream.CloseElement();

    // Missing values

    file_stream.OpenElement("MissingValues");

    // Missing values method

    {
        file_stream.OpenElement("MissingValuesMethod");

        if(missing_values_method == MissingValuesMethod::Mean)
        {
            file_stream.PushText("Mean");
        }
        else if(missing_values_method == MissingValuesMethod::Median)
        {
            file_stream.PushText("Median");
        }
        else
        {
            // Any other method value is serialized as "Unuse".
            file_stream.PushText("Unuse");
        }

        file_stream.CloseElement();
    }

    // Missing values number

    {
        file_stream.OpenElement("MissingValuesNumber");

        buffer.str("");
        buffer << missing_values_number;

        file_stream.PushText(buffer.str().c_str());

        file_stream.CloseElement();
    }

    // Detailed per-column / per-row counts are only written when there is anything missing.
    if(missing_values_number > 0)
    {
        // Columns missing values number
        {
            file_stream.OpenElement("ColumnsMissingValuesNumber");

            const Index columns_number = columns_missing_values_number.size();

            buffer.str("");

            for(Index i = 0; i < columns_number; i++)
            {
                buffer << columns_missing_values_number(i);

                if(i != (columns_number-1)) buffer << " ";
            }

            file_stream.PushText(buffer.str().c_str());

            file_stream.CloseElement();
        }

        // Rows missing values number
        {
            file_stream.OpenElement("RowsMissingValuesNumber");

            buffer.str("");
            buffer << rows_missing_values_number;

            file_stream.PushText(buffer.str().c_str());

            file_stream.CloseElement();
        }
    }

    // Missing values

    file_stream.CloseElement();

    // Preview data

    file_stream.OpenElement("PreviewData");

    file_stream.OpenElement("PreviewSize");

    buffer.str("");
    buffer << data_file_preview.size();

    file_stream.PushText(buffer.str().c_str());

    file_stream.CloseElement();

    // One <Row> element per preview line, values joined with commas.
    for(Index i = 0; i < data_file_preview.size(); i++)
    {
        file_stream.OpenElement("Row");

        file_stream.PushAttribute("Item", to_string(i+1).c_str());

        for(Index j = 0; j < data_file_preview(i).size(); j++)
        {
            file_stream.PushText(data_file_preview(i)(j).c_str());

            if(j != data_file_preview(i).size()-1)
            {
                file_stream.PushText(",");
            }
        }

        file_stream.CloseElement();
    }

    // Close preview data

    file_stream.CloseElement();

    // Close data set

    file_stream.CloseElement();
}
6853
6854
6855void DataSet::from_XML(const tinyxml2::XMLDocument& data_set_document)
6856{
6857 ostringstream buffer;
6858
6859 // Data set element
6860
6861 const tinyxml2::XMLElement* data_set_element = data_set_document.FirstChildElement("DataSet");
6862
6863 if(!data_set_element)
6864 {
6865 buffer << "OpenNN Exception: DataSet class.\n"
6866 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
6867 << "Data set element is nullptr.\n";
6868
6869 throw logic_error(buffer.str());
6870 }
6871
6872 // Data file
6873
6874 const tinyxml2::XMLElement* data_file_element = data_set_element->FirstChildElement("DataFile");
6875
6876 if(!data_file_element)
6877 {
6878 buffer << "OpenNN Exception: DataSet class.\n"
6879 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
6880 << "Data file element is nullptr.\n";
6881
6882 throw logic_error(buffer.str());
6883 }
6884
6885 // Data file name
6886
6887 const tinyxml2::XMLElement* data_file_name_element = data_file_element->FirstChildElement("DataFileName");
6888
6889 if(!data_file_name_element)
6890 {
6891 buffer << "OpenNN Exception: DataSet class.\n"
6892 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
6893 << "DataFileName element is nullptr.\n";
6894
6895 throw logic_error(buffer.str());
6896 }
6897
6898 if(data_file_name_element->GetText())
6899 {
6900 const string new_data_file_name = data_file_name_element->GetText();
6901
6902 set_data_file_name(new_data_file_name);
6903 }
6904
6905 // Separator
6906
6907 const tinyxml2::XMLElement* separator_element = data_file_element->FirstChildElement("Separator");
6908
6909 if(separator_element)
6910 {
6911 if(separator_element->GetText())
6912 {
6913 const string new_separator = separator_element->GetText();
6914
6915 set_separator(new_separator);
6916 }
6917 else
6918 {
6919 set_separator("Comma");
6920 }
6921 }
6922 else
6923 {
6924 set_separator("Comma");
6925 }
6926
6927 // Has columns names
6928
6929 const tinyxml2::XMLElement* columns_names_element = data_file_element->FirstChildElement("ColumnsNames");
6930
6931 if(columns_names_element)
6932 {
6933 const string new_columns_names_string = columns_names_element->GetText();
6934
6935 try
6936 {
6937 set_has_columns_names(new_columns_names_string == "1");
6938 }
6939 catch(const logic_error& e)
6940 {
6941 cerr << e.what() << endl;
6942 }
6943 }
6944
6945 // Rows labels
6946
6947 const tinyxml2::XMLElement* rows_label_element = data_file_element->FirstChildElement("RowsLabels");
6948
6949 if(rows_label_element)
6950 {
6951 const string new_rows_label_string = rows_label_element->GetText();
6952
6953 try
6954 {
6955 set_has_rows_label(new_rows_label_string == "1");
6956 }
6957 catch(const logic_error& e)
6958 {
6959 cerr << e.what() << endl;
6960 }
6961 }
6962
6963 // Missing values label
6964
6965 const tinyxml2::XMLElement* missing_values_label_element = data_file_element->FirstChildElement("MissingValuesLabel");
6966
6967 if(missing_values_label_element)
6968 {
6969 if(missing_values_label_element->GetText())
6970 {
6971 const string new_missing_values_label = missing_values_label_element->GetText();
6972
6973 set_missing_values_label(new_missing_values_label);
6974 }
6975 else
6976 {
6978 }
6979 }
6980 else
6981 {
6983 }
6984
6985 // Forecasting
6986
6987 // Lags number
6988
6989 const tinyxml2::XMLElement* lags_number_element = data_file_element->FirstChildElement("LagsNumber");
6990
6991 if(!lags_number_element)
6992 {
6993 buffer << "OpenNN Exception: DataSet class.\n"
6994 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
6995 << "Lags number element is nullptr.\n";
6996
6997 throw logic_error(buffer.str());
6998 }
6999
7000 if(lags_number_element->GetText())
7001 {
7002 const Index new_lags_number = static_cast<Index>(atoi(lags_number_element->GetText()));
7003
7004 set_lags_number(new_lags_number);
7005 }
7006
7007 // Steps ahead
7008
7009 const tinyxml2::XMLElement* steps_ahead_element = data_file_element->FirstChildElement("StepsAhead");
7010
7011 if(!steps_ahead_element)
7012 {
7013 buffer << "OpenNN Exception: DataSet class.\n"
7014 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7015 << "Steps ahead element is nullptr.\n";
7016
7017 throw logic_error(buffer.str());
7018 }
7019
7020 if(steps_ahead_element->GetText())
7021 {
7022 const Index new_steps_ahead = static_cast<Index>(atoi(steps_ahead_element->GetText()));
7023
7024 set_steps_ahead_number(new_steps_ahead);
7025 }
7026
7027 // Time column
7028
7029 const tinyxml2::XMLElement* time_column_element = data_file_element->FirstChildElement("TimeColumn");
7030
7031 if(!time_column_element)
7032 {
7033 buffer << "OpenNN Exception: DataSet class.\n"
7034 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7035 << "Time column element is nullptr.\n";
7036
7037 throw logic_error(buffer.str());
7038 }
7039
7040 if(time_column_element->GetText())
7041 {
7042 const string new_time_column = time_column_element->GetText();
7043
7044 set_time_column(new_time_column);
7045 }
7046
7047 // Columns
7048
7049 const tinyxml2::XMLElement* columns_element = data_set_element->FirstChildElement("Columns");
7050
7051 if(!columns_element)
7052 {
7053 buffer << "OpenNN Exception: DataSet class.\n"
7054 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7055 << "Columns element is nullptr.\n";
7056
7057 throw logic_error(buffer.str());
7058 }
7059
7060 // Columns number
7061
7062 const tinyxml2::XMLElement* columns_number_element = columns_element->FirstChildElement("ColumnsNumber");
7063
7064 if(!columns_number_element)
7065 {
7066 buffer << "OpenNN Exception: DataSet class.\n"
7067 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7068 << "Columns number element is nullptr.\n";
7069
7070 throw logic_error(buffer.str());
7071 }
7072
7073 Index new_columns_number = 0;
7074
7075 if(columns_number_element->GetText())
7076 {
7077 new_columns_number = static_cast<Index>(atoi(columns_number_element->GetText()));
7078
7079 set_columns_number(new_columns_number);
7080 }
7081
7082 // Columns
7083
7084 const tinyxml2::XMLElement* start_element = columns_number_element;
7085
7086 if(new_columns_number > 0)
7087 {
7088 for(Index i = 0; i < new_columns_number; i++)
7089 {
7090 const tinyxml2::XMLElement* column_element = start_element->NextSiblingElement("Column");
7091 start_element = column_element;
7092
7093 if(column_element->Attribute("Item") != to_string(i+1))
7094 {
7095 buffer << "OpenNN Exception: DataSet class.\n"
7096 << "void DataSet:from_XML(const tinyxml2::XMLDocument&) method.\n"
7097 << "Column item number (" << i+1 << ") does not match (" << column_element->Attribute("Item") << ").\n";
7098
7099 throw logic_error(buffer.str());
7100 }
7101
7102 // Name
7103
7104 const tinyxml2::XMLElement* name_element = column_element->FirstChildElement("Name");
7105
7106 if(!name_element)
7107 {
7108 buffer << "OpenNN Exception: DataSet class.\n"
7109 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7110 << "Name element is nullptr.\n";
7111
7112 throw logic_error(buffer.str());
7113 }
7114
7115 if(name_element->GetText())
7116 {
7117 const string new_name = name_element->GetText();
7118
7119 columns(i).name = new_name;
7120 }
7121
7122 // Scaler
7123
7124 const tinyxml2::XMLElement* scaler_element = column_element->FirstChildElement("Scaler");
7125
7126 if(!scaler_element)
7127 {
7128 buffer << "OpenNN Exception: DataSet class.\n"
7129 << "void DataSet::from_XML(const tinyxml2::XMLDocument&) method.\n"
7130 << "Scaler element is nullptr.\n";
7131
7132 throw logic_error(buffer.str());
7133 }
7134
7135 if(scaler_element->GetText())
7136 {
7137 const string new_scaler = scaler_element->GetText();
7138
7139 columns(i).set_scaler(new_scaler);
7140 }
7141
7142 // Column use
7143
7144 const tinyxml2::XMLElement* column_use_element = column_element->FirstChildElement("ColumnUse");
7145
7146 if(!column_use_element)
7147 {
7148 buffer << "OpenNN Exception: DataSet class.\n"
7149 << "void DataSet::from_XML(const tinyxml2::XMLDocument&) method.\n"
7150 << "Column use element is nullptr.\n";
7151
7152 throw logic_error(buffer.str());
7153 }
7154
7155 if(column_use_element->GetText())
7156 {
7157 const string new_column_use = column_use_element->GetText();
7158
7159 columns(i).set_use(new_column_use);
7160 }
7161
7162 // Type
7163
7164 const tinyxml2::XMLElement* type_element = column_element->FirstChildElement("Type");
7165
7166 if(!type_element)
7167 {
7168 buffer << "OpenNN Exception: DataSet class.\n"
7169 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7170 << "Type element is nullptr.\n";
7171
7172 throw logic_error(buffer.str());
7173 }
7174
7175 if(type_element->GetText())
7176 {
7177 const string new_type = type_element->GetText();
7178 columns(i).set_type(new_type);
7179 }
7180
7181 if(columns(i).type == ColumnType::Categorical || columns(i).type == ColumnType::Binary)
7182 {
7183 // Categories
7184
7185 const tinyxml2::XMLElement* categories_element = column_element->FirstChildElement("Categories");
7186
7187 if(!categories_element)
7188 {
7189 buffer << "OpenNN Exception: DataSet class.\n"
7190 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7191 << "Categories element is nullptr.\n";
7192
7193 throw logic_error(buffer.str());
7194 }
7195
7196 if(categories_element->GetText())
7197 {
7198 const string new_categories = categories_element->GetText();
7199
7200 columns(i).categories = get_tokens(new_categories, ';');
7201 }
7202
7203 // Categories uses
7204
7205 const tinyxml2::XMLElement* categories_uses_element = column_element->FirstChildElement("CategoriesUses");
7206
7207 if(!categories_uses_element)
7208 {
7209 buffer << "OpenNN Exception: DataSet class.\n"
7210 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7211 << "Categories uses element is nullptr.\n";
7212
7213 throw logic_error(buffer.str());
7214 }
7215
7216 if(categories_uses_element->GetText())
7217 {
7218 const string new_categories_uses = categories_uses_element->GetText();
7219
7220 columns(i).set_categories_uses(get_tokens(new_categories_uses, ';'));
7221 }
7222 }
7223 }
7224 }
7225
7226
7227 // Time series columns
7228
7229 const tinyxml2::XMLElement* time_series_columns_element = data_set_element->FirstChildElement("TimeSeriesColumns");
7230
7231 if(!time_series_columns_element)
7232 {
7233// buffer << "OpenNN Exception: DataSet class.\n"
7234// << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7235// << "Time series columns element is nullptr.\n";
7236
7237// throw logic_error(buffer.str());
7238
7239 // do nothing
7240 }
7241 else
7242 {
7243 // Time series columns number
7244
7245 const tinyxml2::XMLElement* time_series_columns_number_element = time_series_columns_element->FirstChildElement("TimeSeriesColumnsNumber");
7246
7247 if(!time_series_columns_number_element)
7248 {
7249 buffer << "OpenNN Exception: DataSet class.\n"
7250 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7251 << "Time seires columns number element is nullptr.\n";
7252
7253 throw logic_error(buffer.str());
7254 }
7255
7256 Index time_series_new_columns_number = 0;
7257
7258 if(time_series_columns_number_element->GetText())
7259 {
7260 time_series_new_columns_number = static_cast<Index>(atoi(time_series_columns_number_element->GetText()));
7261
7262 set_time_series_columns_number(time_series_new_columns_number);
7263 }
7264
7265 // Time series columns
7266
7267 const tinyxml2::XMLElement* time_series_start_element = time_series_columns_number_element;
7268
7269 if(time_series_new_columns_number > 0)
7270 {
7271 for(Index i = 0; i < time_series_new_columns_number; i++)
7272 {
7273 const tinyxml2::XMLElement* time_series_column_element = time_series_start_element->NextSiblingElement("TimeSeriesColumn");
7274 time_series_start_element = time_series_column_element;
7275
7276 if(time_series_column_element->Attribute("Item") != to_string(i+1))
7277 {
7278 buffer << "OpenNN Exception: DataSet class.\n"
7279 << "void DataSet:from_XML(const tinyxml2::XMLDocument&) method.\n"
7280 << "Time series column item number (" << i+1 << ") does not match (" << time_series_column_element->Attribute("Item") << ").\n";
7281
7282 throw logic_error(buffer.str());
7283 }
7284
7285 // Name
7286
7287 const tinyxml2::XMLElement* time_series_name_element = time_series_column_element->FirstChildElement("Name");
7288
7289 if(!time_series_name_element)
7290 {
7291 buffer << "OpenNN Exception: DataSet class.\n"
7292 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7293 << "Time series name element is nullptr.\n";
7294
7295 throw logic_error(buffer.str());
7296 }
7297
7298 if(time_series_name_element->GetText())
7299 {
7300 const string time_series_new_name = time_series_name_element->GetText();
7301
7302 time_series_columns(i).name = time_series_new_name;
7303 }
7304
7305 // Scaler
7306
7307 const tinyxml2::XMLElement* time_series_scaler_element = time_series_column_element->FirstChildElement("Scaler");
7308
7309 if(!time_series_scaler_element)
7310 {
7311 buffer << "OpenNN Exception: DataSet class.\n"
7312 << "void DataSet::from_XML(const tinyxml2::XMLDocument&) method.\n"
7313 << "Time series scaler element is nullptr.\n";
7314
7315 throw logic_error(buffer.str());
7316 }
7317
7318 if(time_series_scaler_element->GetText())
7319 {
7320 const string time_series_new_scaler = time_series_scaler_element->GetText();
7321
7322 time_series_columns(i).set_scaler(time_series_new_scaler);
7323 }
7324
7325 // Column use
7326
7327 const tinyxml2::XMLElement* time_series_column_use_element = time_series_column_element->FirstChildElement("ColumnUse");
7328
7329 if(!time_series_column_use_element)
7330 {
7331 buffer << "OpenNN Exception: DataSet class.\n"
7332 << "void DataSet::from_XML(const tinyxml2::XMLDocument&) method.\n"
7333 << "Time series column use element is nullptr.\n";
7334
7335 throw logic_error(buffer.str());
7336 }
7337
7338 if(time_series_column_use_element->GetText())
7339 {
7340 const string time_series_new_column_use = time_series_column_use_element->GetText();
7341
7342 time_series_columns(i).set_use(time_series_new_column_use);
7343 }
7344
7345 // Type
7346
7347 const tinyxml2::XMLElement* time_series_type_element = time_series_column_element->FirstChildElement("Type");
7348
7349 if(!time_series_type_element)
7350 {
7351 buffer << "OpenNN Exception: DataSet class.\n"
7352 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7353 << "Time series type element is nullptr.\n";
7354
7355 throw logic_error(buffer.str());
7356 }
7357
7358 if(time_series_type_element->GetText())
7359 {
7360 const string time_series_new_type = time_series_type_element->GetText();
7361 time_series_columns(i).set_type(time_series_new_type);
7362 }
7363
7364 if(time_series_columns(i).type == ColumnType::Categorical || time_series_columns(i).type == ColumnType::Binary)
7365 {
7366 // Categories
7367
7368 const tinyxml2::XMLElement* time_series_categories_element = time_series_column_element->FirstChildElement("Categories");
7369
7370 if(!time_series_categories_element)
7371 {
7372 buffer << "OpenNN Exception: DataSet class.\n"
7373 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7374 << "Time series categories element is nullptr.\n";
7375
7376 throw logic_error(buffer.str());
7377 }
7378
7379 if(time_series_categories_element->GetText())
7380 {
7381 const string time_series_new_categories = time_series_categories_element->GetText();
7382
7383 time_series_columns(i).categories = get_tokens(time_series_new_categories, ';');
7384 }
7385
7386 // Categories uses
7387
7388 const tinyxml2::XMLElement* time_series_categories_uses_element = time_series_column_element->FirstChildElement("CategoriesUses");
7389
7390 if(!time_series_categories_uses_element)
7391 {
7392 buffer << "OpenNN Exception: DataSet class.\n"
7393 << "void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7394 << "Time series categories uses element is nullptr.\n";
7395
7396 throw logic_error(buffer.str());
7397 }
7398
7399 if(time_series_categories_uses_element->GetText())
7400 {
7401 const string time_series_new_categories_uses = time_series_categories_uses_element->GetText();
7402
7403 time_series_columns(i).set_categories_uses(get_tokens(time_series_new_categories_uses, ';'));
7404 }
7405 }
7406 }
7407 }
7408 }
7409
7410 // Rows label
7411
7412 if(has_rows_labels)
7413 {
7414 // Rows labels begin tag
7415
7416 const tinyxml2::XMLElement* rows_labels_element = data_set_element->FirstChildElement("RowsLabels");
7417
7418 if(!rows_labels_element)
7419 {
7420 buffer << "OpenNN Exception: DataSet class.\n"
7421 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7422 << "Rows labels element is nullptr.\n";
7423
7424 throw logic_error(buffer.str());
7425 }
7426
7427 // Rows labels
7428
7429 if(rows_labels_element->GetText())
7430 {
7431 const string new_rows_labels = rows_labels_element->GetText();
7432
7433 rows_labels = get_tokens(new_rows_labels, ',');
7434 }
7435 }
7436
7437 // Samples
7438
7439 const tinyxml2::XMLElement* samples_element = data_set_element->FirstChildElement("Samples");
7440
7441 if(!samples_element)
7442 {
7443 buffer << "OpenNN Exception: DataSet class.\n"
7444 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7445 << "Samples element is nullptr.\n";
7446
7447 throw logic_error(buffer.str());
7448 }
7449
7450 // Samples number
7451
7452 const tinyxml2::XMLElement* samples_number_element = samples_element->FirstChildElement("SamplesNumber");
7453
7454 if(!samples_number_element)
7455 {
7456 buffer << "OpenNN Exception: DataSet class.\n"
7457 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7458 << "Samples number element is nullptr.\n";
7459
7460 throw logic_error(buffer.str());
7461 }
7462
7463 if(samples_number_element->GetText())
7464 {
7465 const Index new_samples_number = static_cast<Index>(atoi(samples_number_element->GetText()));
7466
7467 samples_uses.resize(new_samples_number);
7468 }
7469
7470 // Samples uses
7471
7472 const tinyxml2::XMLElement* samples_uses_element = samples_element->FirstChildElement("SamplesUses");
7473
7474 if(!samples_uses_element)
7475 {
7476 buffer << "OpenNN Exception: DataSet class.\n"
7477 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7478 << "Samples uses element is nullptr.\n";
7479
7480 throw logic_error(buffer.str());
7481 }
7482
7483 if(samples_uses_element->GetText())
7484 {
7485 set_samples_uses(get_tokens(samples_uses_element->GetText(), ' '));
7486 }
7487
7488 // Missing values
7489
7490 const tinyxml2::XMLElement* missing_values_element = data_set_element->FirstChildElement("MissingValues");
7491
7492 if(!missing_values_element)
7493 {
7494 buffer << "OpenNN Exception: DataSet class.\n"
7495 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7496 << "Missing values element is nullptr.\n";
7497
7498 throw logic_error(buffer.str());
7499 }
7500
7501 // Missing values method
7502
7503 const tinyxml2::XMLElement* missing_values_method_element = missing_values_element->FirstChildElement("MissingValuesMethod");
7504
7505 if(!missing_values_method_element)
7506 {
7507 buffer << "OpenNN Exception: DataSet class.\n"
7508 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7509 << "Missing values method element is nullptr.\n";
7510
7511 throw logic_error(buffer.str());
7512 }
7513
7514 if(missing_values_method_element->GetText())
7515 {
7516 set_missing_values_method(missing_values_method_element->GetText());
7517 }
7518
7519 // Missing values number
7520
7521 const tinyxml2::XMLElement* missing_values_number_element = missing_values_element->FirstChildElement("MissingValuesNumber");
7522
7523 if(!missing_values_number_element)
7524 {
7525 buffer << "OpenNN Exception: DataSet class.\n"
7526 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7527 << "Missing values number element is nullptr.\n";
7528
7529 throw logic_error(buffer.str());
7530 }
7531
7532 if(missing_values_number_element->GetText())
7533 {
7534 missing_values_number = static_cast<Index>(atoi(missing_values_number_element->GetText()));
7535 }
7536
7537 if(missing_values_number > 0)
7538 {
7539 // Columns Missing values number
7540
7541 const tinyxml2::XMLElement* columns_missing_values_number_element = missing_values_element->FirstChildElement("ColumnsMissingValuesNumber");
7542
7543 if(!columns_missing_values_number_element)
7544 {
7545 buffer << "OpenNN Exception: DataSet class.\n"
7546 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7547 << "Columns missing values number element is nullptr.\n";
7548
7549 throw logic_error(buffer.str());
7550 }
7551
7552 if(columns_missing_values_number_element->GetText())
7553 {
7554 Tensor<string, 1> new_columns_missing_values_number = get_tokens(columns_missing_values_number_element->GetText(), ' ');
7555
7556 columns_missing_values_number.resize(new_columns_missing_values_number.size());
7557
7558 for(Index i = 0; i < new_columns_missing_values_number.size(); i++)
7559 {
7560 columns_missing_values_number(i) = atoi(new_columns_missing_values_number(i).c_str());
7561 }
7562 }
7563
7564 // Rows missing values number
7565
7566 const tinyxml2::XMLElement* rows_missing_values_number_element = missing_values_element->FirstChildElement("RowsMissingValuesNumber");
7567
7568 if(!rows_missing_values_number_element)
7569 {
7570 buffer << "OpenNN Exception: DataSet class.\n"
7571 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7572 << "Rows missing values number element is nullptr.\n";
7573
7574 throw logic_error(buffer.str());
7575 }
7576
7577 if(rows_missing_values_number_element->GetText())
7578 {
7579 rows_missing_values_number = static_cast<Index>(atoi(rows_missing_values_number_element->GetText()));
7580 }
7581 }
7582
7583 // Preview data
7584
7585 const tinyxml2::XMLElement* preview_data_element = data_set_element->FirstChildElement("PreviewData");
7586
7587 if(!preview_data_element)
7588 {
7589 buffer << "OpenNN Exception: DataSet class.\n"
7590 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7591 << "Preview data element is nullptr.\n";
7592
7593 throw logic_error(buffer.str());
7594 }
7595
7596 // Preview size
7597
7598 const tinyxml2::XMLElement* preview_size_element = preview_data_element->FirstChildElement("PreviewSize");
7599
7600 if(!preview_size_element)
7601 {
7602 buffer << "OpenNN Exception: DataSet class.\n"
7603 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7604 << "Preview size element is nullptr.\n";
7605
7606 throw logic_error(buffer.str());
7607 }
7608
7609 Index new_preview_size = 0;
7610
7611 if(preview_size_element->GetText())
7612 {
7613 new_preview_size = static_cast<Index>(atoi(preview_size_element->GetText()));
7614
7615 if(new_preview_size > 0) data_file_preview.resize(new_preview_size);
7616 }
7617
7618 // Preview data
7619
7620 start_element = preview_size_element;
7621
7622 for(Index i = 0; i < new_preview_size; i++)
7623 {
7624 const tinyxml2::XMLElement* row_element = start_element->NextSiblingElement("Row");
7625 start_element = row_element;
7626
7627 if(row_element->Attribute("Item") != to_string(i+1))
7628 {
7629 buffer << "OpenNN Exception: DataSet class.\n"
7630 << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
7631 << "Row item number (" << i+1 << ") does not match (" << row_element->Attribute("Item") << ").\n";
7632
7633 throw logic_error(buffer.str());
7634 }
7635
7636 if(row_element->GetText())
7637 {
7638 data_file_preview(i) = get_tokens(row_element->GetText(), ',');
7639 }
7640 }
7641
7642 // Display
7643
7644 const tinyxml2::XMLElement* display_element = data_set_element->FirstChildElement("Display");
7645
7646 if(display_element)
7647 {
7648 const string new_display_string = display_element->GetText();
7649
7650 try
7651 {
7652 set_display(new_display_string != "0");
7653 }
7654 catch(const logic_error& e)
7655 {
7656 cerr << e.what() << endl;
7657 }
7658 }
7659}
7660
7661
7663
7664void DataSet::print() const
7665{
7666 if(display)
7667 {
7668 const Index variables_number = get_variables_number();
7669 const Index samples_number = get_samples_number();
7670
7671 cout << "Data set object summary:\n"
7672 << "Number of variables: " << variables_number << "\n"
7673 << "Number of samples: " << samples_number << "\n";
7674 }
7675}
7676
7677
7680
7681void DataSet::save(const string& file_name) const
7682{
7683 FILE* pFile = fopen(file_name.c_str(), "w");
7684
7685 tinyxml2::XMLPrinter document(pFile);
7686
7687 write_XML(document);
7688
7689 fclose(pFile);
7690}
7691
7692
7715
7716void DataSet::load(const string& file_name)
7717{
7718 tinyxml2::XMLDocument document;
7719
7720 if(document.LoadFile(file_name.c_str()))
7721 {
7722 ostringstream buffer;
7723
7724 buffer << "OpenNN Exception: DataSet class.\n"
7725 << "void load(const string&) method.\n"
7726 << "Cannot load XML file " << file_name << ".\n";
7727
7728 throw logic_error(buffer.str());
7729 }
7730
7731 from_XML(document);
7732}
7733
7734
7735void DataSet::print_columns() const
7736{
7737 const Index columns_number = get_columns_number();
7738
7739 for(Index i = 0; i < columns_number; i++)
7740 {
7741 columns(i).print();
7742 cout << endl;
7743 }
7744
7745 cout << endl;
7746
7747}
7748
7749void DataSet::print_columns_types() const
7750{
7751 const Index columns_number = get_columns_number();
7752
7753 for(Index i = 0; i < columns_number; i++)
7754 {
7755 if(columns(i).type == ColumnType::Numeric) cout << "Numeric ";
7756 else if(columns(i).type == ColumnType::Binary) cout << "Binary ";
7757 else if(columns(i).type == ColumnType::Categorical) cout << "Categorical ";
7758 else if(columns(i).type == ColumnType::DateTime) cout << "DateTime ";
7759 else if(columns(i).type == ColumnType::Constant) cout << "Constant ";
7760 }
7761
7762 cout << endl;
7763}
7764
7765
7766void DataSet::print_columns_uses() const
7767{
7768 const Index columns_number = get_columns_number();
7769
7770 for(Index i = 0; i < columns_number; i++)
7771 {
7772 if(columns(i).column_use == VariableUse::Input) cout << "Input ";
7773 else if(columns(i).column_use == VariableUse::Target) cout << "Target ";
7774 else if(columns(i).column_use == VariableUse::UnusedVariable) cout << "Unused ";
7775 }
7776
7777 cout << endl;
7778}
7779
7780
7782
7784{
7785 if(display) cout << data << endl;
7786}
7787
7788
7791
7793{
7794 if(!display) return;
7795
7796 const Index samples_number = get_samples_number();
7797
7798 if(samples_number > 0)
7799 {
7800 const Tensor<type, 1> first_sample = data.chip(0, 0);
7801
7802 cout << "First sample: \n";
7803
7804 for(int i = 0; i< first_sample.dimension(0); i++)
7805 {
7806 cout << first_sample(i) << " ";
7807 }
7808
7809 cout << endl;
7810 }
7811
7812 if(samples_number > 1)
7813 {
7814 const Tensor<type, 1> second_sample = data.chip(1, 0);
7815
7816 cout << "Second sample: \n";
7817
7818 for(int i = 0; i< second_sample.dimension(0); i++)
7819 {
7820 cout << second_sample(i) << " ";
7821 }
7822
7823 cout << endl;
7824 }
7825
7826 if(samples_number > 2)
7827 {
7828 const Tensor<type, 1> last_sample = data.chip(samples_number-1, 0);
7829
7830 cout << "Last sample: \n";
7831
7832 for(int i = 0; i< last_sample.dimension(0); i++)
7833 {
7834 cout << last_sample(i) << " ";
7835 }
7836
7837 cout << endl;
7838 }
7839}
7840
7841
7843
7845{
7846 ofstream file(data_file_name.c_str());
7847
7848 if(!file.is_open())
7849 {
7850 ostringstream buffer;
7851
7852 buffer << "OpenNN Exception: Matrix template." << endl
7853 << "void save_csv(const string&, const char&, const Vector<string>&, const Vector<string>&) method." << endl
7854 << "Cannot open matrix data file: " << data_file_name << endl;
7855
7856 throw logic_error(buffer.str());
7857 }
7858
7859 file.precision(20);
7860
7861 const Index samples_number = get_samples_number();
7862 const Index variables_number = get_variables_number();
7863
7864 const Tensor<string, 1> variables_names = get_variables_names();
7865
7866 char separator_char = ',';//get_separator_char();
7867
7868 if(this->has_rows_labels)
7869 {
7870 file << "id" << separator_char;
7871 }
7872 for(Index j = 0; j < variables_number; j++)
7873 {
7874 file << variables_names[j];
7875
7876 if(j != variables_number-1)
7877 {
7878 file << separator_char;
7879 }
7880 }
7881
7882 file << endl;
7883
7884 for(Index i = 0; i < samples_number; i++)
7885 {
7886 if(this->has_rows_labels)
7887 {
7888 file << rows_labels(i) << separator_char;
7889 }
7890 for(Index j = 0; j < variables_number; j++)
7891 {
7892 file << data(i,j);
7893
7894 if(j != variables_number-1)
7895 {
7896 file << separator_char;
7897 }
7898 }
7899
7900 file << endl;
7901 }
7902
7903 file.close();
7904}
7905
7906
7908
7909void DataSet::save_data_binary(const string& binary_data_file_name) const
7910{
7911 ofstream file(binary_data_file_name.c_str(), ios::binary);
7912
7913 if(!file.is_open())
7914 {
7915 ostringstream buffer;
7916
7917 buffer << "OpenNN Exception: DataSet class." << endl
7918 << "void save_data_binary() method." << endl
7919 << "Cannot open data binary file." << endl;
7920
7921 throw logic_error(buffer.str());
7922 }
7923
7924 // Write data
7925
7926 streamsize size = sizeof(Index);
7927
7928 Index columns_number = data.dimension(1);
7929 Index rows_number = data.dimension(0);
7930
7931 cout << "Saving binary data file..." << endl;
7932
7933 file.write(reinterpret_cast<char*>(&columns_number), size);
7934 file.write(reinterpret_cast<char*>(&rows_number), size);
7935
7936 size = sizeof(type);
7937
7938 type value;
7939
7940 for(int i = 0; i < columns_number; i++)
7941 {
7942 for(int j = 0; j < rows_number; j++)
7943 {
7944 value = data(j,i);
7945
7946 file.write(reinterpret_cast<char*>(&value), size);
7947 }
7948 }
7949
7950 file.close();
7951
7952 cout << "Binary data file saved." << endl;
7953}
7954
7955
7957
7958void DataSet::save_time_series_data_binary(const string& binary_data_file_name) const
7959{
7960 ofstream file(binary_data_file_name.c_str(), ios::binary);
7961
7962 if(!file.is_open())
7963 {
7964 ostringstream buffer;
7965
7966 buffer << "OpenNN Exception: DataSet class." << endl
7967 << "void save_time_series_data_binary(const string) method." << endl
7968 << "Cannot open data binary file." << endl;
7969
7970 throw logic_error(buffer.str());
7971 }
7972
7973 // Write data
7974
7975 streamsize size = sizeof(Index);
7976
7977 Index columns_number = time_series_data.dimension(1);
7978 Index rows_number = time_series_data.dimension(0);
7979
7980 cout << "Saving binary data file..." << endl;
7981
7982 file.write(reinterpret_cast<char*>(&columns_number), size);
7983 file.write(reinterpret_cast<char*>(&rows_number), size);
7984
7985 size = sizeof(type);
7986
7987 type value;
7988
7989 for(int i = 0; i < columns_number; i++)
7990 {
7991 for(int j = 0; j < rows_number; j++)
7992 {
7993 value = time_series_data(j,i);
7994
7995 file.write(reinterpret_cast<char*>(&value), size);
7996 }
7997 }
7998
7999 file.close();
8000
8001 cout << "Binary data file saved." << endl;
8002}
8003
8004
8006
8008{
8009 if(lags_number == 0 || steps_ahead == 0) return;
8010
8011 transform_time_series_data();
8012
8014
8016
8018}
8019
8020
8022
8024{
8025 ifstream file;
8026
8027 file.open(data_file_name.c_str(), ios::binary);
8028
8029 if(!file.is_open())
8030 {
8031 ostringstream buffer;
8032
8033 buffer << "OpenNN Exception: DataSet class.\n"
8034 << "void load_binary() method.\n"
8035 << "Cannot open binary file: " << data_file_name << "\n";
8036
8037 throw logic_error(buffer.str());
8038 }
8039
8040 streamsize size = sizeof(Index);
8041
8042 Index columns_number;
8043 Index rows_number;
8044
8045 file.read(reinterpret_cast<char*>(&columns_number), size);
8046 file.read(reinterpret_cast<char*>(&rows_number), size);
8047
8048 size = sizeof(type);
8049
8050 type value;
8051
8052 data.resize(rows_number, columns_number);
8053
8054 for(Index i = 0; i < rows_number*columns_number; i++)
8055 {
8056 file.read(reinterpret_cast<char*>(&value), size);
8057
8058 data(i) = value;
8059 }
8060
8061 file.close();
8062}
8063
8064
8066
8067void DataSet::load_time_series_data_binary(const string& time_series_data_file_name)
8068{
8069 ifstream file;
8070
8071 file.open(time_series_data_file_name.c_str(), ios::binary);
8072
8073 if(!file.is_open())
8074 {
8075 ostringstream buffer;
8076
8077 buffer << "OpenNN Exception: DataSet class.\n"
8078 << "void load_time_series_data_binary(const string&) method.\n"
8079 << "Cannot open binary file: " << time_series_data_file_name << "\n";
8080
8081 throw logic_error(buffer.str());
8082 }
8083
8084 streamsize size = sizeof(Index);
8085
8086 Index columns_number;
8087 Index rows_number;
8088
8089 file.read(reinterpret_cast<char*>(&columns_number), size);
8090 file.read(reinterpret_cast<char*>(&rows_number), size);
8091
8092 size = sizeof(type);
8093
8094 type value;
8095
8096 time_series_data.resize(rows_number, columns_number);
8097
8098 for(Index i = 0; i < rows_number*columns_number; i++)
8099 {
8100 file.read(reinterpret_cast<char*>(&value), size);
8101
8102 time_series_data(i) = value;
8103 }
8104
8105 file.close();
8106}
8107
8108
8110
8111void DataSet::check_input_csv(const string & input_data_file_name, const char & separator_char) const
8112{
8113 ifstream file(input_data_file_name.c_str());
8114
8115 if(!file.is_open())
8116 {
8117 ostringstream buffer;
8118
8119 buffer << "OpenNN Exception: DataSet class.\n"
8120 << "void check_input_csv() method.\n"
8121 << "Cannot open input data file: " << input_data_file_name << "\n";
8122
8123 throw logic_error(buffer.str());
8124 }
8125
8126 string line;
8127 Index line_number = 0;
8128 Index total_lines = 0;
8129
8130 Index tokens_count;
8131
8132 const Index columns_number = get_columns_number() - get_target_columns_number();
8133
8134 while(file.good())
8135 {
8136 line_number++;
8137
8138 getline(file, line);
8139
8140 trim(line);
8141
8142 erase(line, '"');
8143
8144 if(line.empty()) continue;
8145
8146 total_lines++;
8147
8148 tokens_count = count_tokens(line, separator_char);
8149
8150 if(tokens_count != columns_number)
8151 {
8152 ostringstream buffer;
8153
8154 buffer << "OpenNN Exception: DataSet class.\n"
8155 << "void check_input_csv() method.\n"
8156 << "Line " << line_number << ": Size of tokens in input file ("
8157 << tokens_count << ") is not equal to number of columns("
8158 << columns_number << "). \n"
8159 << "Input csv must contain values for all the variables except the target. \n";
8160
8161 throw logic_error(buffer.str());
8162 }
8163 }
8164
8165 file.close();
8166
8167 if(total_lines == 0)
8168 {
8169 ostringstream buffer;
8170
8171 buffer << "OpenNN Exception: DataSet class.\n"
8172 << "void check_input_csv() method.\n"
8173 << "Input data file is empty. \n";
8174
8175 throw logic_error(buffer.str());
8176 }
8177}
8178
8179
8181
8182Tensor<type, 2> DataSet::read_input_csv(const string& input_data_file_name,
8183 const char& separator_char,
8184 const string& missing_values_label,
8185 const bool& has_columns_name,
8186 const bool& has_rows_label) const
8187{
8188 ifstream file(input_data_file_name.c_str());
8189
8190 if(!file.is_open())
8191 {
8192 ostringstream buffer;
8193
8194 buffer << "OpenNN Exception: DataSet class.\n"
8195 << "void read_input_csv() method.\n"
8196 << "Cannot open input data file: " << input_data_file_name << "\n";
8197
8198 throw logic_error(buffer.str());
8199 }
8200
8201 // Count samples number
8202
8203 Index input_samples_count = 0;
8204
8205 string line;
8206 Index line_number = 0;
8207
8208 Index tokens_count;
8209
8210 const Index columns_number = get_columns_number() - get_target_columns_number();
8211
8212 while(file.good())
8213 {
8214 line_number++;
8215
8216 getline(file, line);
8217
8218 trim(line);
8219
8220 erase(line, '"');
8221
8222 if(line.empty()) continue;
8223
8224 tokens_count = count_tokens(line, separator_char);
8225
8226 if(tokens_count != columns_number)
8227 {
8228 ostringstream buffer;
8229
8230 buffer << "OpenNN Exception: DataSet class.\n"
8231 << "void read_input_csv() method.\n"
8232 << "Line " << line_number << ": Size of tokens("
8233 << tokens_count << ") is not equal to number of columns("
8234 << columns_number << ").\n";
8235
8236 throw logic_error(buffer.str());
8237 }
8238
8239 input_samples_count++;
8240 }
8241
8242 file.close();
8243
8244 Index variables_number = get_input_variables_number();
8245
8246 if(has_columns_name) input_samples_count--;
8247
8248 Tensor<type, 2> input_data(input_samples_count, variables_number);
8249
8250 // Fill input data
8251
8252 file.open(input_data_file_name.c_str());
8253
8254 if(!file.is_open())
8255 {
8256 ostringstream buffer;
8257
8258 buffer << "OpenNN Exception: DataSet class.\n"
8259 << "void read_input_csv() method.\n"
8260 << "Cannot open input data file: " << input_data_file_name << " for filling input data file. \n";
8261
8262 throw logic_error(buffer.str());
8263 }
8264
8265 // Read first line
8266
8267 if(has_columns_name)
8268 {
8269 while(file.good())
8270 {
8271 getline(file, line);
8272
8273 if(line.empty()) continue;
8274
8275 break;
8276 }
8277 }
8278
8279 // Read rest of the lines
8280
8281 Tensor<string, 1> tokens;
8282
8283 line_number = 0;
8284 Index variable_index = 0;
8285 Index token_index = 0;
8286 bool is_ID = has_rows_label;
8287
8288 const bool is_float = is_same<type, float>::value;
8289 bool has_missing_values = false;
8290
8291 while(file.good())
8292 {
8293 getline(file, line);
8294
8295 trim(line);
8296
8297 erase(line, '"');
8298
8299 if(line.empty()) continue;
8300
8301 tokens = get_tokens(line, separator_char);
8302
8303 variable_index = 0;
8304 token_index = 0;
8305 is_ID = has_rows_label;
8306
8307 for(Index i = 0; i < columns.size(); i++)
8308 {
8309 if(is_ID)
8310 {
8311 is_ID = false;
8312 continue;
8313 }
8314
8315 if(columns(i).column_use == VariableUse::UnusedVariable)
8316 {
8317 token_index++;
8318 continue;
8319 }
8320 else if(columns(i).column_use != VariableUse::Input)
8321 {
8322 continue;
8323 }
8324
8325 if(columns(i).type == ColumnType::Numeric)
8326 {
8327 if(tokens(token_index) == missing_values_label || tokens(token_index).empty())
8328 {
8329 has_missing_values = true;
8330 input_data(line_number, variable_index) = static_cast<type>(NAN);
8331 }
8332 else if(is_float)
8333 {
8334 input_data(line_number, variable_index) = type(strtof(tokens(token_index).data(), NULL));
8335 }
8336 else
8337 {
8338 input_data(line_number, variable_index) = type(stof(tokens(token_index)));
8339 }
8340
8341 variable_index++;
8342 }
8343 else if(columns(i).type == ColumnType::Binary)
8344 {
8345 if(tokens(token_index) == missing_values_label)
8346 {
8347 has_missing_values = true;
8348 input_data(line_number, variable_index) = static_cast<type>(NAN);
8349 }
8350 else if(columns(i).categories.size() > 0 && tokens(token_index) == columns(i).categories(0))
8351 {
8352 input_data(line_number, variable_index) = type(1);
8353 }
8354 else if(tokens(token_index) == columns(i).name)
8355 {
8356 input_data(line_number, variable_index) = type(1);
8357 }
8358
8359 variable_index++;
8360 }
8361 else if(columns(i).type == ColumnType::Categorical)
8362 {
8363 for(Index k = 0; k < columns(i).get_categories_number(); k++)
8364 {
8365 if(tokens(token_index) == missing_values_label)
8366 {
8367 has_missing_values = true;
8368 input_data(line_number, variable_index) = static_cast<type>(NAN);
8369 }
8370 else if(tokens(token_index) == columns(i).categories(k))
8371 {
8372 input_data(line_number, variable_index) = type(1);
8373 }
8374
8375 variable_index++;
8376 }
8377 }
8378 else if(columns(i).type == ColumnType::DateTime)
8379 {
8380 if(tokens(token_index) == missing_values_label || tokens(token_index).empty())
8381 {
8382 has_missing_values = true;
8383 input_data(line_number, variable_index) = static_cast<type>(NAN);
8384 }
8385 else
8386 {
8387 input_data(line_number, variable_index) = static_cast<type>(date_to_timestamp(tokens(token_index), gmt));
8388 }
8389
8390 variable_index++;
8391 }
8392 else if(columns(i).type == ColumnType::Constant)
8393 {
8394 if(tokens(token_index) == missing_values_label || tokens(token_index).empty())
8395 {
8396 has_missing_values = true;
8397 input_data(line_number, variable_index) = static_cast<type>(NAN);
8398 }
8399 else if(is_float)
8400 {
8401 input_data(line_number, variable_index) = type(strtof(tokens(token_index).data(), NULL));
8402 }
8403 else
8404 {
8405 input_data(line_number, variable_index) = type(stof(tokens(token_index)));
8406 }
8407
8408 variable_index++;
8409 }
8410
8411 token_index++;
8412 }
8413
8414 line_number++;
8415 }
8416
8417 file.close();
8418
8419 if(!has_missing_values)
8420 {
8421 return input_data;
8422 }
8423 else
8424 {
8425 // Scrub missing values
8426
8428
8429 if(missing_values_method == MissingValuesMethod::Unuse || missing_values_method == MissingValuesMethod::Mean)
8430 {
8431 const Tensor<type, 1> means = mean(input_data);
8432
8433 const Index samples_number = input_data.dimension(0);
8434 const Index variables_number = input_data.dimension(1);
8435
8436 #pragma omp parallel for schedule(dynamic)
8437
8438 for(Index j = 0; j < variables_number; j++)
8439 {
8440 for(Index i = 0; i < samples_number; i++)
8441 {
8442 if(isnan(input_data(i, j)))
8443 {
8444 input_data(i,j) = means(j);
8445 }
8446 }
8447 }
8448 }
8449 else
8450 {
8451 const Tensor<type, 1> medians = median(input_data);
8452
8453 const Index samples_number = input_data.dimension(0);
8454 const Index variables_number = input_data.dimension(1);
8455
8456 #pragma omp parallel for schedule(dynamic)
8457
8458 for(Index j = 0; j < variables_number; j++)
8459 {
8460 for(Index i = 0; i < samples_number; i++)
8461 {
8462 if(isnan(input_data(i, j)))
8463 {
8464 input_data(i,j) = medians(j);
8465 }
8466 }
8467 }
8468 }
8469
8470 return input_data;
8471 }
8472}
8473
8474
8479
8481{
8482 const Index samples_number = get_samples_number();
8483 const Index targets_number = get_target_variables_number();
8484 const Tensor<Index, 1> target_variables_indices = get_target_variables_indices();
8485
8486 Tensor<Index, 1> class_distribution;
8487
8488 if(targets_number == 1) // Two classes
8489 {
8490 class_distribution = Tensor<Index, 1>(2);
8491
8492 Index target_index = target_variables_indices(0);
8493
8494 Index positives = 0;
8495 Index negatives = 0;
8496
8497 for(Index sample_index = 0; sample_index < static_cast<Index>(samples_number); sample_index++)
8498 {
8499 if(!isnan(data(static_cast<Index>(sample_index),target_index)))
8500 {
8501 if(data(static_cast<Index>(sample_index), target_index) < static_cast<type>(0.5))
8502 {
8503 negatives++;
8504 }
8505 else
8506 {
8507 positives++;
8508 }
8509 }
8510 }
8511
8512 class_distribution(0) = negatives;
8513 class_distribution(1) = positives;
8514 }
8515 else // More than two classes
8516 {
8517 class_distribution = Tensor<Index, 1>(targets_number);
8518
8519 for(Index i = 0; i < samples_number; i++)
8520 {
8521 if(get_sample_use(i) != SampleUse::UnusedSample)
8522 {
8523 for(Index j = 0; j < targets_number; j++)
8524 {
8525 if(data(i,target_variables_indices(j)) == static_cast<type>(NAN)) continue;
8526
8527 if(data(i,target_variables_indices(j)) > type(0.5)) class_distribution(j)++;
8528 }
8529 }
8530 }
8531 }
8532
8533 return class_distribution;
8534}
8535
8536
8539
8540Tensor<Tensor<Index, 1>, 1> DataSet::calculate_Tukey_outliers(const type& cleaning_parameter) const
8541{
8542 const Index samples_number = get_used_samples_number();
8543 const Tensor<Index, 1> samples_indices = get_used_samples_indices();
8544
8545 const Index columns_number = get_columns_number();
8546 const Index used_columns_number = get_used_columns_number();
8547 const Tensor<Index, 1> used_columns_indices = get_used_columns_indices();
8548
8549 Tensor<Tensor<Index, 1>, 1> return_values(2);
8550
8551 return_values(0) = Tensor<Index, 1>(samples_number);
8552 return_values(1) = Tensor<Index, 1>(used_columns_number);
8553
8554 return_values(0).setZero();
8555 return_values(1).setZero();
8556
8557 Tensor<BoxPlot, 1> box_plots = calculate_columns_box_plots();
8558
8559 Index used_column_index = 0;
8560 Index variable_index = 0;
8561
8562 #pragma omp parallel for
8563
8564 for(Index i = 0; i < columns_number; i++)
8565 {
8566 if(columns(i).column_use == VariableUse::UnusedVariable && columns(i).type == ColumnType::Categorical)
8567 {
8568 variable_index += columns(i).get_categories_number();
8569 continue;
8570 }
8571 else if(columns(i).column_use == VariableUse::UnusedVariable) // Numeric, Binary or DateTime
8572 {
8573 variable_index++;
8574 continue;
8575 }
8576
8577 if(columns(i).type == ColumnType::Categorical || columns(i).type == ColumnType::Binary || columns(i).type == ColumnType::DateTime)
8578 {
8579 used_column_index++;
8580 columns(i).get_categories_number() == 0 ? variable_index++ : variable_index += columns(i).get_categories_number();
8581 continue;
8582 }
8583 else // Numeric
8584 {
8585 const type interquartile_range = box_plots(used_column_index).third_quartile - box_plots(used_column_index).first_quartile;
8586
8587 if(interquartile_range < numeric_limits<type>::epsilon())
8588 {
8589 used_column_index++;
8590 variable_index++;
8591 continue;
8592 }
8593
8594 Index columns_outliers = 0;
8595
8596 for(Index j = 0; j < samples_number; j++)
8597 {
8598 const Tensor<type, 1> sample = get_sample_data(samples_indices(static_cast<Index>(j)));
8599
8600 if(sample(variable_index) <(box_plots(used_column_index).first_quartile - cleaning_parameter*interquartile_range) ||
8601 sample(variable_index) >(box_plots(used_column_index).third_quartile + cleaning_parameter*interquartile_range))
8602 {
8603 return_values(0)(static_cast<Index>(j)) = 1;
8604
8605 columns_outliers++;
8606 }
8607 }
8608
8609 return_values(1)(used_column_index) = columns_outliers;
8610
8611 used_column_index++;
8612 variable_index++;
8613 }
8614 }
8615
8616 return return_values;
8617}
8618
8619
8623
/// Marks as unused the samples flagged as Tukey outliers.
/// NOTE(review): the marking step below is commented out, so this method
/// currently computes the outlier flags and discards them — it has no
/// observable effect. Confirm whether the disabled calls should be restored.

void DataSet::unuse_Tukey_outliers(const type& cleaning_parameter)
{
    const Tensor<Tensor<Index, 1>, 1> outliers_indices = calculate_Tukey_outliers(cleaning_parameter);

//    const Tensor<Index, 1> outliers_samples = outliers_indices(0).get_indices_greater_than(0);

//    set_samples_unused(outliers_samples);
}
8632
8633
8634Tensor<Index, 1> DataSet::select_outliers_via_contamination(const Tensor<type, 1>& outlier_ranks,
8635 const type & contamination,
8636 bool higher) const
8637{
8638 const Index samples_number = get_used_samples_number();
8639
8640 Tensor<Tensor<type, 1>, 1> ordered_ranks(samples_number);
8641
8642 Tensor<Index, 1> outlier_indexes(samples_number);
8643 outlier_indexes.setZero();
8644
8645 for(Index i = 0; i < samples_number; i++)
8646 {
8647 ordered_ranks(i) = Tensor<type, 1>(2);
8648 ordered_ranks(i)(0) = type(i);
8649 ordered_ranks(i)(1) = outlier_ranks(i);
8650 }
8651
8652 sort(ordered_ranks.data(), ordered_ranks.data() + samples_number,
8653 [](Tensor<type, 1> & a, Tensor<type, 1> & b) -> bool
8654 {
8655 return a(1) < b(1);
8656 });
8657
8658 if(higher)
8659 {
8660 for(Index i = Index((type(1) - contamination)*type(samples_number)); i < samples_number; i++)
8661 outlier_indexes(static_cast<Index>(ordered_ranks(i)(0))) = 1;
8662 }
8663 else
8664 {
8665 for(Index i = 0; i < Index(contamination*type(samples_number)); i++)
8666 outlier_indexes(static_cast<Index>(ordered_ranks(i)(0))) = 1;
8667 }
8668
8669 return outlier_indexes;
8670}
8671
8672
8673Tensor<Index, 1> DataSet::select_outliers_via_standard_deviation(const Tensor<type, 1>& outlier_ranks,
8674 const type & deviation_factor,
8675 bool higher) const
8676{
8677 const Index samples_number = get_used_samples_number();
8678 const type mean_ranks = mean(outlier_ranks);
8679 const type std_ranks = standard_deviation(outlier_ranks);
8680
8681 Tensor<Index, 1> outlier_indexes(samples_number);
8682 outlier_indexes.setZero();
8683
8684
8685 if(higher)
8686 {
8687 for(Index i = 0; i < samples_number; i++)
8688 {
8689 if(outlier_ranks(i) > mean_ranks + deviation_factor*std_ranks)
8690 outlier_indexes(i) = 1;
8691 }
8692 }
8693 else
8694 {
8695 for(Index i = 0; i < samples_number; i++)
8696 {
8697 if(outlier_ranks(i) < mean_ranks - deviation_factor*std_ranks)
8698 outlier_indexes(i) = 1;
8699 }
8700 }
8701
8702
8703 return outlier_indexes;
8704}
8705
8706
8707type DataSet::calculate_euclidean_distance(const Tensor<Index, 1>& variables_indices,
8708 const Index& sample_index,
8709 const Index& other_sample_index) const
8710{
8711 const Index input_variables_number = variables_indices.size();
8712
8713 type distance = type(0);
8714 type error;
8715
8716 for(Index i = 0; i < input_variables_number; i++)
8717 {
8718 error = data(sample_index, variables_indices(i)) - data(other_sample_index, variables_indices(i));
8719
8720 distance += error*error;
8721 }
8722
8723 return sqrt(distance);
8724}
8725
8726
8727Tensor<type, 2> DataSet::calculate_distance_matrix(const Tensor<Index,1>& indices)const
8728{
8729 const Index samples_number = indices.size();
8730
8731 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
8732
8733 Tensor<type, 2> distance_matrix(samples_number, samples_number);
8734 distance_matrix.setZero();
8735
8736 #pragma omp parallel for
8737
8738 for(Index i = 0; i < samples_number ; i++)
8739 {
8740 for(Index k = 0; k < i; k++)
8741 {
8742 distance_matrix(i,k)
8743 = distance_matrix(k,i)
8744 = calculate_euclidean_distance(input_variables_indices, indices(i), indices(k));
8745 }
8746 }
8747 return distance_matrix;
8748}
8749
8750
/// Returns, for every sample, the list of indices of its k nearest neighbours
/// (closest first) according to the given distance matrix.
/// @param distance_matrix Square matrix of pairwise distances between samples.
/// @param k_neighbors Number of neighbours kept per sample.

Tensor<list<Index>, 1> DataSet::calculate_k_nearest_neighbors(const Tensor<type, 2>& distance_matrix, const Index& k_neighbors) const
{
    const Index samples_number = distance_matrix.dimensions()[0];

    Tensor<list<Index>, 1> neighbors_indices(samples_number);

    #pragma omp parallel for

    for(Index i = 0; i < samples_number; i++)
    {
        // Running list of the k smallest distances seen so far, in ascending order.
        list<type> min_distances(k_neighbors, numeric_limits<type>::max());

        neighbors_indices(i) = list<Index>(k_neighbors, 0);

        for(Index j = 0; j < samples_number; j++)
        {
            if(j == i) continue;

            list<Index>::iterator neighbor_it = neighbors_indices(i).begin();
            list<type>::iterator current_min = min_distances.begin();

            // Walk both lists in lock-step and insert j in front of the first
            // stored distance it improves on. Each insertion grows the lists
            // by one element; the overflow is trimmed after the loop.
            for(Index k = 0; k < k_neighbors; k++, current_min++, neighbor_it++)
            {
                if(distance_matrix(i,j) < *current_min)
                {
                    neighbors_indices(i).insert(neighbor_it, j);

                    min_distances.insert(current_min, distance_matrix(i,j));

                    break;
                }
            }
        }

        // Drop the surplus entries so exactly k neighbours remain.
        neighbors_indices(i).resize(k_neighbors);
    }

    return neighbors_indices;
}
8790
8791
8792Tensor<Tensor<type, 1>, 1> DataSet::get_kd_tree_data() const
8793{
8794 const Index used_samples_number = get_used_samples_number();
8795 const Index input_variables_number = get_input_variables_number();
8796
8797 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
8798 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
8799
8800 Tensor<Tensor<type, 1>, 1> kd_tree_data(used_samples_number);
8801
8802 for(Index i = 0; i < used_samples_number; i++)
8803 {
8804 kd_tree_data(i) = Tensor<type, 1>(input_variables_number+1);
8805
8806 kd_tree_data(i)(0) = type(used_samples_indices(i)); // Storing index
8807
8808 for(Index j = 0; j < input_variables_number; j++)
8809 kd_tree_data(i)(j+1) = data(used_samples_indices(i), input_variables_indices(j));
8810 }
8811
8812 return kd_tree_data;
8813}
8814
8815
/// Builds, for every tree level, the sample-range limits of the k-d tree
/// nodes. bounding_limits(d) has 2^d + 1 entries; node j at depth d covers
/// the half-open sample range [bounding_limits(d)(j), bounding_limits(d)(j+1)).

Tensor<Tensor<Index, 1>, 1> DataSet::create_bounding_limits_kd_tree(const Index& depth) const
{

    Tensor<Tensor<Index, 1>, 1> bounding_limits(depth+1);

    // Depth 0: one node covering all used samples.
    bounding_limits(0) = Tensor<Index, 1>(2);
    bounding_limits(0)(0) = 0;
    bounding_limits(0)(1) = get_used_samples_number();


    for(Index i = 1; i <= depth; i++)
    {
        bounding_limits(i) = Tensor<Index, 1>(pow(2, i)+1);
        bounding_limits(i)(0) = 0;

        // Split each parent range [a, b) at its integer midpoint into two
        // child ranges; j walks the odd positions (the new midpoints).
        for(Index j = 1; j < bounding_limits(i).size()-1; j = j+2)
        {
            bounding_limits(i)(j) = (bounding_limits(i-1)(j/2+1) - bounding_limits(i-1)(j/2))/2
                    + bounding_limits(i-1)(j/2);

            bounding_limits(i)(j+1) = bounding_limits(i-1)(j/2+1);
        }
    }
    return bounding_limits;
}
8841
8842
/// Sorts the k-d tree rows in place so that each node's sample range is
/// partitioned around its split variable.
/// NOTE(review): element 0 of each row stores the sample index (see
/// get_kd_tree_data), so split variables start at 1; the comparator sorts in
/// descending order of the split variable.

void DataSet::create_kd_tree(Tensor<Tensor<type, 1>, 1>& tree, const Tensor<Tensor<Index, 1>, 1>& bounding_limits) const
{
    const Index depth = bounding_limits.size()-1;
    const Index input_variables = tree(0).size();

    // Sorts the rows in [first, last) by the given variable, descending.
    auto specific_sort = [&tree](const Index & first, const Index & last, const Index & split_variable)
    {
        sort(tree.data() + first, tree.data() + last,
             [&split_variable](const Tensor<type, 1> & a, const Tensor<type, 1> & b) -> bool
        {
            return a(split_variable) > b(split_variable);
        });
    };

    specific_sort(bounding_limits(0)(0), bounding_limits(0)(1), 1);

    Index split_variable = 2;

    for(Index i = 1; i <= depth; i++, split_variable++)
    {
        // Cycle through the variables, never returning to slot 0 (the index slot).
        split_variable = max(split_variable % input_variables, static_cast<Index>(1));

        specific_sort(bounding_limits(i)(0), bounding_limits(i)(1), split_variable);

        // The ranges [limits(j)+1, limits(j+1)) are pairwise disjoint,
        // so the sorts below can run in parallel.
        #pragma omp parallel for
        for(Index j = 1; j < (Index)bounding_limits(i).size()-1; j++)
            specific_sort(bounding_limits(i)(j)+1, bounding_limits(i)(j+1), split_variable);
    }
}
8872
8873
/// Intended to compute, for each leaf bounding box of the k-d tree, the k
/// nearest neighbours of every sample it contains.
/// NOTE(review): the implementation is commented out, so this method currently
/// returns an empty tensor — the disabled code below shows the intended logic.

Tensor<list<Index>, 1> DataSet::calculate_bounding_boxes_neighbors(const Tensor<Tensor<type, 1>, 1>& tree,
                                                                   const Tensor<Index, 1>& leaves_indices,
                                                                   const Index& depth,
                                                                   const Index& k_neighbors) const
{
/*
    const Index used_samples_number = get_used_samples_number();
    const Index leaves_number = pow(2, depth);

    Tensor<type, 1> bounding_box;

    Tensor<type, 2> distance_matrix;
    Tensor<list<Index>, 1> k_nearest_neighbors(used_samples_number);

    for(Index i = 0; i < leaves_number; i++) // Each bounding box
    {
        const Index first = leaves_indices(i);
        const Index last = leaves_indices(i+1);
        bounding_box = Tensor<type, 1>(last-first);

        for(Index j = 0; j < last - first; j++)
            bounding_box(j) = tree(first+j)(0);

        Tensor<type, 2> distance_matrix = calculate_distance_matrix(bounding_box);
        Tensor<list<Index>, 1> box_nearest_neighbors = calculate_k_nearest_neighbors(distance_matrix, k_neighbors);

        for(Index j = 0; j < last - first; j++)
        {
            for(auto & element : box_nearest_neighbors(j))
                element = bounding_box(element);

            k_nearest_neighbors(bounding_box(j)) = move(box_nearest_neighbors(j));
        }
    }

    return k_nearest_neighbors;
*/
    return Tensor<list<Index>, 1>();
}
8913
8914
/// Intended to compute k-nearest-neighbour lists using a k-d tree partition of
/// the used samples.
/// NOTE(review): the implementation is commented out, so this method currently
/// returns an empty tensor; callers relying on it (the kdtree branch of
/// calculate_local_outlier_factor_outliers) therefore get no neighbours.

Tensor<list<Index>, 1> DataSet::calculate_kd_tree_neighbors(const Index& k_neighbors, const Index& min_samples_leaf) const
{
/*
    const Index used_samples_number = get_used_samples_number();

    Tensor<Tensor<type, 1>, 1> tree = get_kd_tree_data();

    const Index depth = max(floor(log(static_cast<type>(used_samples_number)/static_cast<type>(min_samples_leaf))),
                            static_cast<type>(0.0));

    Tensor<Tensor<Index, 1>, 1> bounding_limits = create_bounding_limits_kd_tree(depth);

    create_kd_tree(tree, bounding_limits);

    return calculate_bounding_boxes_neighbors(tree, bounding_limits(depth), depth, k_neighbors);
*/
    return Tensor<list<Index>, 1>();
}
8933
8934
8935Tensor<type, 1> DataSet::calculate_average_reachability(Tensor<list<Index>, 1>& k_nearest_indexes,
8936 const Index& k) const
8937{
8938 const Index samples_number = get_used_samples_number();
8939 const Tensor<Index, 1> samples_indices = get_used_samples_indices();
8940 const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();
8941
8942 Tensor<type, 1> average_reachability(samples_number);
8943 average_reachability.setZero();
8944
8945 #pragma omp parallel for
8946
8947 for(Index i = 0; i < samples_number; i++)
8948 {
8949 list<Index>::iterator neighbor_it = k_nearest_indexes(i).begin();
8950
8951 type distance_between_points;
8952 type distance_2_k_neighbor;
8953
8954 for(Index j = 0; j < k; j++, neighbor_it++)
8955 {
8956 const Index neighbor_k_index = k_nearest_indexes(*neighbor_it).back();
8957
8958 distance_between_points = calculate_euclidean_distance(input_variables_indices, i, *neighbor_it);
8959 distance_2_k_neighbor = calculate_euclidean_distance(input_variables_indices, *neighbor_it, neighbor_k_index);
8960
8961 average_reachability(i) += max(distance_between_points, distance_2_k_neighbor);
8962 }
8963
8964 average_reachability(i) /= type(k);
8965 }
8966
8967 return average_reachability;
8968}
8969
8970
8971Tensor<type, 1> DataSet::calculate_local_outlier_factor(Tensor<list<Index>, 1>& k_nearest_indexes,
8972 const Tensor<type, 1>& average_reachabilities,
8973 const Index & k) const
8974{
8975 const Index samples_number = get_used_samples_number();
8976 Tensor<type, 1> LOF_value(samples_number);
8977
8978 #pragma omp parallel for
8979
8980 for(Index i = 0; i < samples_number; i++)
8981 {
8982 type sum = type(0);
8983
8984 for(auto & neighbor_index : k_nearest_indexes(i))
8985 sum += average_reachabilities(i) / average_reachabilities(neighbor_index);
8986
8987 LOF_value(i) = sum/k ;
8988 }
8989 return LOF_value;
8990}
8991
8992
8993
9001
9002Tensor<Index, 1> DataSet::calculate_local_outlier_factor_outliers(const Index& k_neighbors,
9003 const Index& min_samples_leaf,
9004 const type& contamination) const
9005{
9006 if(k_neighbors < 0)
9007 {
9008 ostringstream buffer;
9009
9010 buffer << "OpenNN Exception: DataSet class.\n"
9011 << "Tensor<Index, 1> DataSet::calculate_local_outlier_factor_outliers(const Index&, const Index&, const type&) const method.\n"
9012 << "k_neighbors(" << k_neighbors << ") should be a positive integer value\n";
9013
9014 throw logic_error(buffer.str());
9015 }
9016
9017 if(contamination < type(0) && contamination > type(0.5))
9018 {
9019 ostringstream buffer;
9020
9021 buffer << "OpenNN Exception: DataSet class.\n"
9022 << "Tensor<Index, 1> DataSet::calculate_local_outlier_factor_outliers(const Index&, const Index&, const type&) const method.\n"
9023 << "Outlier contamination(" << contamination << ") should be a value between 0.0 and 0.5\n";
9024
9025 throw logic_error(buffer.str());
9026 }
9027
9028 const Index samples_number = get_used_samples_number();
9029
9030 bool kdtree = false;
9031
9032 Index k = min(k_neighbors, samples_number-1);
9033 Index min_samples_leaf_fix = max(min_samples_leaf, static_cast<Index>(0));
9034
9035 if(min_samples_leaf == 0 && samples_number > 5000)
9036 {
9037 min_samples_leaf_fix = 200;
9038 k = min(k, min_samples_leaf_fix-1);
9039 kdtree = true;
9040 }
9041 else if(min_samples_leaf!=0 && min_samples_leaf < samples_number/2)
9042 {
9043 k = min(k, min_samples_leaf_fix-1);
9044 kdtree = true;
9045 }
9046
9047 Tensor<list<Index>, 1> k_nearest_indexes;
9048
9049 kdtree ? k_nearest_indexes = calculate_kd_tree_neighbors(k, min_samples_leaf_fix)
9050 : k_nearest_indexes = calculate_k_nearest_neighbors(calculate_distance_matrix(get_used_samples_indices()), k);
9051
9052
9053 const Tensor<type, 1> average_reachabilities = calculate_average_reachability(k_nearest_indexes, k);
9054
9055
9056 const Tensor<type, 1> LOF_value = calculate_local_outlier_factor(k_nearest_indexes, average_reachabilities, k);
9057
9058
9059 Tensor<Index, 1> outlier_indexes;
9060
9061 contamination > type(0)
9062 ? outlier_indexes = select_outliers_via_contamination(LOF_value, contamination, true)
9063 : outlier_indexes = select_outliers_via_standard_deviation(LOF_value, type(2.0), true);
9064
9065 return outlier_indexes;
9066}
9067
9068
9069void DataSet::calculate_min_max_indices_list(list<Index>& elements, const Index& variable_index, type& min, type& max) const
9070{
9071 type value;
9072 min = max = data(elements.front(), variable_index);
9073 for(auto & sample_index : elements)
9074 {
9075 value = data(sample_index, variable_index);
9076 if(min > value) min = value;
9077 else if(max < value) max = value;
9078 }
9079}
9080
9081
/// Intended to split the front node of the isolation-tree work queue into two
/// children according to the node's split variable and split value, returning
/// the number of non-trivial child nodes queued for the next depth.
/// NOTE(review): the implementation is commented out, so this method currently
/// does nothing and returns a default-constructed Index (0); create_isolation_tree
/// therefore never descends past the root.

Index DataSet::split_isolation_tree(Tensor<type, 2> & tree, list<list<Index>>& tree_simulation, list<Index>& tree_index) const
{
/*
    const Index current_tree_index = tree_index.front();
    const type current_variable = tree(current_tree_index, 1);
    const type division_value = tree(current_tree_index, 0);

    list<Index> current_node_samples = tree_simulation.front();

    list<Index> one_side_samples;
    list<Index> other_side_samples;

    Index delta_next_depth_nodes = 0;
    Index one_side_count = 0;
    Index other_side_count = 0;


    for(auto & sample_index : current_node_samples)
    {
        if(data(sample_index, current_variable) < division_value)
        {
            one_side_count++;
            one_side_samples.push_back(sample_index);
        }
        else
        {
            other_side_count++;
            other_side_samples.push_back(sample_index);
        }
    }

    if(one_side_count != 0)
    {
        if(one_side_count != 1)
        {
            tree_simulation.push_back(one_side_samples);
            tree_index.push_back(current_tree_index*2+1);
            delta_next_depth_nodes++;
        }

        if(other_side_count != 1)
        {
            tree_simulation.push_back(other_side_samples);
            tree_index.push_back(current_tree_index*2+2);
            delta_next_depth_nodes++;
        }

        tree(current_tree_index*2+1, 2) = type(one_side_count);
        tree(current_tree_index*2+2, 2) = type(other_side_count);
    }


    return delta_next_depth_nodes;
*/
    return Index();
}
9138
9139
/// Builds one isolation tree over the given samples, stored as a flat
/// implicit-heap matrix: row i is a node whose children are rows 2i+1 and
/// 2i+2. Column 0 holds the split value, column 1 the split variable and
/// column 2 the number of samples in the node; unused nodes stay at +infinity.
/// NOTE(review): node expansion relies on split_isolation_tree, whose body is
/// currently commented out — confirm the tree actually grows beyond the root.

Tensor<type, 2> DataSet::create_isolation_tree(const Tensor<Index, 1>& indices, const Index& max_depth) const
{
    const Index used_samples_number = indices.size();

    const Index variables_number = get_input_variables_number();
    const Tensor<Index, 1> input_variables_indices = get_input_variables_indices();

    // Work queues: the sample lists of pending nodes and their heap positions.
    list<list<Index>> tree_simulation;
    list<Index> tree_index;
    list<Index> current_node_samples;

    // Full binary tree of max_depth levels: 2^(max_depth+1) - 1 nodes.
    Tensor<type, 2> tree(pow(2, max_depth+1) - 1, 3);
    tree.setConstant(numeric_limits<type>::infinity());

    for(Index i = 0; i < used_samples_number; i++)
        current_node_samples.push_back(indices(i));

    // Seed the queue with the root node containing every sample.
    tree_simulation.push_back(current_node_samples);
    tree(0, 2) = type(used_samples_number);
    tree_index.push_back(0);

    current_node_samples.clear();

    Index current_depth_nodes = 1;
    Index next_depth_nodes = 0;
    // One random split variable is drawn per depth level.
    Index current_variable_index = input_variables_indices(rand() % variables_number);
    Index current_depth = 0;

    Index current_index;

    type min, max;

    while(current_depth < max_depth && !(tree_simulation.empty()))
    {
        current_node_samples = tree_simulation.front();
        current_index = tree_index.front();

        calculate_min_max_indices_list(current_node_samples, current_variable_index, min, max);

        // Uniform random split value in [min, max] of the node's samples.
        tree(current_index, 0) = static_cast<type>((max-min)*(type(rand())/static_cast<type>(RAND_MAX))) + min;

        tree(current_index, 1) = type(current_variable_index);

        next_depth_nodes += split_isolation_tree(tree, tree_simulation, tree_index);

        tree_simulation.pop_front();
        tree_index.pop_front();

        current_depth_nodes--;

        // All nodes of the current depth processed: advance one level and
        // draw a new split variable.
        if(current_depth_nodes == 0)
        {
            current_depth++;
            swap(current_depth_nodes, next_depth_nodes);
            current_variable_index = input_variables_indices(rand() % variables_number);
        }
    }

    return tree;
}
9200
9201
9202Tensor<Tensor<type, 2>, 1> DataSet::create_isolation_forest(const Index& trees_number, const Index& sub_set_size, const Index& max_depth) const
9203{
9204 const Tensor<Index, 1> indices = get_used_samples_indices();
9205 const Index samples_number = get_used_samples_number();
9206 Tensor<Tensor<type, 2>, 1> forest(trees_number);
9207
9208 std::random_device rng;
9209 std::mt19937 urng(rng());
9210
9211 for(Index i = 0; i < trees_number; i++)
9212 {
9213 Tensor<Index, 1> sub_set_indices(sub_set_size);
9214 Tensor<Index, 1> aux_indices = indices;
9215 std::shuffle(aux_indices.data(), aux_indices.data()+samples_number, urng);
9216
9217 for(Index j = 0; j < sub_set_size; j++)
9218 sub_set_indices(j) = aux_indices(j);
9219
9220 forest(i) = create_isolation_tree(sub_set_indices, max_depth);
9221
9222 }
9223 return forest;
9224}
9225
9226
/// Returns the path length of a sample through one isolation tree, following
/// the implicit-heap layout (children of node i are 2i+1 and 2i+2).
/// When descent stops at a node holding more than one sample, the standard
/// average-path correction log(n-1) - 2(n-1)/n + 0.5772 (0.5772 being the
/// Euler-Mascheroni constant) is added to the depth.
/// @param tree Tree matrix as built by create_isolation_tree.
/// @param sample_index Row of the sample in the data matrix.
/// @param tree_depth Maximum depth to descend.

type DataSet::calculate_tree_path(const Tensor<type, 2>& tree, const Index& sample_index,
                                  const Index& tree_depth) const
{
    Index current_index = 0;
    Index current_depth = 0;
    const Index tree_length = tree.dimensions()[0];

    type samples;
    type value;

    while(current_depth < tree_depth)
    {
        // A node holding exactly one sample fully isolates it: stop here.
        if(tree(current_index, 2) == type(1))
        {
            return type(current_depth);
        }
        else if(current_index*2 >= tree_length ||
                (tree(current_index*2+1, 2) == numeric_limits<type>::infinity())
                ) //Next node doesn't exist or node is leaf
        {
            // NOTE(review): unused children keep the infinity sentinel set by
            // create_isolation_tree, which is how leaves are recognised here.
            samples = tree(current_index, 2);

            return type(log(samples- type(1)) - (type(2) *(samples- type(1)))/samples + type(0.5772) + type(current_depth));
        }


        // Descend: left child when the value is below the split, right otherwise.
        value = data(sample_index, static_cast<Index>(tree(current_index, 1)));

        (value < tree(current_index, 0)) ? current_index = current_index*2 + 1
                : current_index = current_index*2 + 2;

        current_depth++;
    }

    // Maximum depth reached: apply the same correction for unresolved nodes.
    samples = tree(current_index, 2);
    if(samples == type(1))
        return type(current_depth);
    else
        return type(log(samples- type(1))-(type(2.0) *(samples- type(1)))/samples + type(0.5772) + type(current_depth));
}
9267
9268
9269Tensor<type, 1> DataSet::calculate_average_forest_paths(const Tensor<Tensor<type, 2>, 1>& forest, const Index& tree_depth) const
9270{
9271 const Index samples_number = get_used_samples_number();
9272 const Index n_trees = forest.dimensions()[0];
9273 Tensor<type, 1> average_paths(samples_number);
9274 average_paths.setZero();
9275
9276 # pragma omp parallel for
9277 for(Index i = 0; i < samples_number; i++)
9278 {
9279 for(Index j = 0; j < n_trees; j++)
9280 average_paths(i) += calculate_tree_path(forest(j), i, tree_depth);
9281
9282 average_paths(i) /= type(n_trees);
9283 }
9284 return average_paths;
9285}
9286
9287
9288Tensor<Index, 1> DataSet::calculate_isolation_forest_outliers(const Index& n_trees,
9289 const Index& subs_set_samples,
9290 const type& contamination) const
9291{
9292 const Index samples_number = get_used_samples_number();
9293 const Index fixed_subs_set_samples = min(samples_number, subs_set_samples);
9294 const Index max_depth = ceil(log2(fixed_subs_set_samples))*2;
9295 const Tensor<Tensor<type, 2>, 1> forest = create_isolation_forest(n_trees, fixed_subs_set_samples, max_depth);
9296
9297 const Tensor<type, 1> average_paths = calculate_average_forest_paths(forest, max_depth);
9298
9299 Tensor<Index, 1> outlier_indexes;
9300
9301 contamination > type(0)
9302 ? outlier_indexes = select_outliers_via_contamination(average_paths, contamination, false)
9303 : outlier_indexes = select_outliers_via_standard_deviation(average_paths, type(2.0), false);
9304
9305 return outlier_indexes;
9306}
9307
9308
9309
9314
9315Tensor<type, 2> DataSet::calculate_autocorrelations(const Index& lags_number) const
9316{
9317 const Index samples_number = time_series_data.dimension(0);
9318
9319 if(lags_number > samples_number)
9320 {
9321 ostringstream buffer;
9322
9323 buffer << "OpenNN Exception: DataSet class.\n"
9324 << "Tensor<type, 2> calculate_cross_correlations(const Index& lags_number) const method.\n"
9325 << "Lags number(" << lags_number
9326 << ") is greater than the number of samples("
9327 << samples_number << ") \n";
9328
9329 throw logic_error(buffer.str());
9330 }
9331
9332 const Index columns_number = get_time_series_columns_number();
9333
9334 const Index input_columns_number = get_input_time_series_columns_number();
9335 const Index target_columns_number = get_target_time_series_columns_number();
9336
9337 const Index input_target_columns_number = input_columns_number + target_columns_number;
9338
9339 const Tensor<Index, 1> input_columns_indices = get_input_time_series_columns_indices();
9340 const Tensor<Index, 1> target_columns_indices = get_target_time_series_columns_indices();
9341
9342 Index input_target_numeric_column_number = 0;
9343
9344 int counter = 0;
9345
9346 for(Index i = 0; i < input_target_columns_number; i++)
9347 {
9348 if(i < input_columns_number)
9349 {
9350 const Index column_index = input_columns_indices(i);
9351
9352 const ColumnType input_column_type = time_series_columns(column_index).type;
9353
9354 if(input_column_type == ColumnType::Numeric)
9355 {
9356 input_target_numeric_column_number++;
9357 }
9358 }
9359 else
9360 {
9361 const Index column_index = target_columns_indices(counter);
9362
9363 const ColumnType target_column_type = time_series_columns(column_index).type;
9364
9365 if(target_column_type == ColumnType::Numeric)
9366 {
9367 input_target_numeric_column_number++;
9368 }
9369
9370 counter++;
9371 }
9372 }
9373
9374 Index new_lags_number;
9375
9376 if(samples_number == lags_number || samples_number < lags_number)
9377 {
9378 new_lags_number = lags_number - 2;
9379 }
9380 else if(samples_number == lags_number + 1)
9381 {
9382 new_lags_number = lags_number - 1;
9383 }
9384 else
9385 {
9386 new_lags_number = lags_number;
9387 }
9388
9389 Tensor<type, 2> autocorrelations(input_target_numeric_column_number, new_lags_number);
9390 Tensor<type, 1> autocorrelations_vector(new_lags_number);
9391 Tensor<type, 2> input_i;
9392 Index counter_i = 0;
9393
9394 for(Index i = 0; i < columns_number; i++)
9395 {
9396 if(time_series_columns(i).column_use != VariableUse::UnusedVariable && time_series_columns(i).type == ColumnType::Numeric)
9397 {
9398 input_i = get_time_series_column_data(i);
9399 cout << "Calculating " << time_series_columns(i).name << " autocorrelations" << endl;
9400 }
9401 else
9402 {
9403 continue;
9404 }
9405
9406 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
9407
9408 autocorrelations_vector = OpenNN::autocorrelations(thread_pool_device, current_input_i, new_lags_number);
9409
9410 for(Index j = 0; j < new_lags_number; j++)
9411 {
9412 autocorrelations (counter_i, j) = autocorrelations_vector(j) ;
9413 }
9414
9415 counter_i++;
9416 }
9417
9418 return autocorrelations;
9419}
9420
9421
9423
/// Calculates the cross correlations between every pair of used numeric
/// time-series columns. Returns a rank-3 tensor indexed by
/// (first column, second column, lag).
/// @param lags_number Requested number of lags; reduced internally when there
/// are not enough samples to estimate them.
/// @throws logic_error When lags_number exceeds the number of samples.

Tensor<type, 3> DataSet::calculate_cross_correlations(const Index& lags_number) const
{
    const Index samples_number = time_series_data.dimension(0);

    if(lags_number > samples_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<type, 3> calculate_cross_correlations(const Index& lags_number) const method.\n"
               << "Lags number(" << lags_number
               << ") is greater than the number of samples("
               << samples_number << ") \n";

        throw logic_error(buffer.str());
    }

    const Index columns_number = get_time_series_columns_number();

    const Index input_columns_number = get_input_time_series_columns_number();
    const Index target_columns_number = get_target_time_series_columns_number();

    const Index input_target_columns_number = input_columns_number + target_columns_number;

    const Tensor<Index, 1> input_columns_indices = get_input_time_series_columns_indices();
    const Tensor<Index, 1> target_columns_indices = get_target_time_series_columns_indices();

    // Count the numeric input/target columns: only those appear in the result.
    Index input_target_numeric_column_number = 0;
    int counter = 0;

    for(Index i = 0; i < input_target_columns_number; i++)
    {
        if(i < input_columns_number)
        {
            const Index column_index = input_columns_indices(i);

            const ColumnType input_column_type = time_series_columns(column_index).type;

            if(input_column_type == ColumnType::Numeric)
            {
                input_target_numeric_column_number++;
            }
        }
        else
        {
            // counter walks the target columns once the inputs are exhausted.
            const Index column_index = target_columns_indices(counter);

            ColumnType target_column_type = time_series_columns(column_index).type;

            if(target_column_type == ColumnType::Numeric)
            {
                input_target_numeric_column_number++;
            }
            counter++;
        }
    }

    // Shrink the lag count when there are not enough samples to estimate it.
    Index new_lags_number;

    if(samples_number == lags_number)
    {
        new_lags_number = lags_number - 2;
    }
    else if(samples_number == lags_number + 1)
    {
        new_lags_number = lags_number - 1;
    }
    else
    {
        new_lags_number = lags_number;
    }

    Tensor<type, 3> cross_correlations(input_target_numeric_column_number, input_target_numeric_column_number, new_lags_number);
    Tensor<type, 1> cross_correlations_vector(new_lags_number);
    Tensor<type, 2> input_i;
    Tensor<type, 2> input_j;
    Index counter_i = 0;
    Index counter_j = 0;

    for(Index i = 0; i < columns_number; i++)
    {
        // counter_i/counter_j advance only over used numeric columns, so the
        // result tensor is densely packed even when columns are skipped.
        if(time_series_columns(i).column_use != VariableUse::UnusedVariable && time_series_columns(i).type == ColumnType::Numeric)
        {
            input_i = get_time_series_column_data(i);

            if(display) cout << "Calculating " << time_series_columns(i).name << " cross correlations:" << endl;

            counter_j = 0;
        }
        else
        {
            continue;
        }

        for(Index j = 0; j < columns_number; j++)
        {
            if(time_series_columns(j).column_use != VariableUse::UnusedVariable && time_series_columns(j).type == ColumnType::Numeric)
            {
                input_j = get_time_series_column_data(j);

                if(display) cout << " -VS- " << time_series_columns(j).name << endl;

            }
            else
            {
                continue;
            }

            const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
            const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));

            cross_correlations_vector = OpenNN::cross_correlations(thread_pool_device, current_input_i, current_input_j, new_lags_number);

            for(Index k = 0; k < new_lags_number; k++)
            {
                cross_correlations (counter_i, counter_j, k) = cross_correlations_vector(k) ;
            }

            counter_j++;
        }

        counter_i++;

    }

    return cross_correlations;
}
9551
9552
9557
9558void DataSet::generate_constant_data(const Index& samples_number, const Index& variables_number, const type& value)
9559{
9560 set(samples_number, variables_number);
9561
9562 data.setConstant(value);
9563
9565}
9566
9567
9573
/// Generates an artificial data set with the given dimensions, filling the
/// data matrix via Eigen's setRandom().
/// @param samples_number Number of rows (samples).
/// @param variables_number Number of columns (variables).

void DataSet::generate_random_data(const Index& samples_number, const Index& variables_number)
{
    set(samples_number, variables_number);

    data.setRandom();
}
9580
9581
9586
9587void DataSet::generate_sequential_data(const Index& samples_number, const Index& variables_number)
9588{
9589 set(samples_number, variables_number);
9590
9591 for(Index i = 0; i < samples_number; i++)
9592 {
9593 for(Index j = 0; j < variables_number; j++)
9594 {
9595 data(i,j) = static_cast<type>(j);
9596 }
9597 }
9598}
9599
9600
9606
/// Generates an artificial Rosenbrock data set: the first variables_number-1
/// columns are filled with random inputs and the last column holds the
/// Rosenbrock function evaluated on those inputs.
/// @param samples_number Number of rows (samples).
/// @param variables_number Total number of columns; the last one is the target.

void DataSet::generate_Rosenbrock_data(const Index& samples_number, const Index& variables_number)
{
    const Index inputs_number = variables_number-1;

    set(samples_number, variables_number);

    data.setRandom();

    // Each iteration writes a different row, so the parallel loop is race-free.

    #pragma omp parallel for

    for(Index i = 0; i < samples_number; i++)
    {
        type rosenbrock(0);

        for(Index j = 0; j < inputs_number-1; j++)
        {
            const type value = data(i,j);
            const type next_value = data(i,j+1);

            // Classic Rosenbrock term: (1 - x)^2 + 100*(y - x^2)^2

            rosenbrock += (type(1) - value)*(type(1) - value) + type(100)*(next_value-value*value)*(next_value-value*value);
        }

        // The last column becomes the target value for this sample.

        data(i, inputs_number) = rosenbrock;
    }

    // NOTE(review): a trailing statement appears to be missing from this view
    // (likely set_default()); verify against upstream.

}
9635
9636
9637void DataSet::generate_sum_data(const Index& samples_number, const Index& variables_number)
9638{
9639 set(samples_number,variables_number);
9640
9641 data.setRandom();
9642
9643 for(Index i = 0; i < samples_number; i++)
9644 {
9645 for(Index j = 0; j < variables_number-1; j++)
9646 {
9647 data(i,variables_number-1) += data(i,j);
9648 }
9649 }
9650
9651 set_default();
9652}
9653
9654
9661
/// Marks as unused every sample holding at least one used-variable value that
/// lies strictly outside the given per-variable [minimum, maximum] range, and
/// returns the indices of the samples filtered out by this call.
/// @param minimums Lower bound per used variable (order of get_used_variables_indices()).
/// @param maximums Upper bound per used variable (same order).
/// @return Indices of the samples switched to UnusedSample.

Tensor<Index, 1> DataSet::filter_data(const Tensor<type, 1>& minimums, const Tensor<type, 1>& maximums)
{
    const Tensor<Index, 1> used_variables_indices = get_used_variables_indices();

    const Index used_variables_number = used_variables_indices.size();

#ifdef OPENNN_DEBUG

    if(minimums.size() != used_variables_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&) method.\n"
               << "Size of minimums(" << minimums.size() << ") is not equal to number of variables(" << used_variables_number << ").\n";

        throw logic_error(buffer.str());
    }

    if(maximums.size() != used_variables_number)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&) method.\n"
               << "Size of maximums(" << maximums.size() << ") is not equal to number of variables(" << used_variables_number << ").\n";

        throw logic_error(buffer.str());
    }

#endif

    const Index samples_number = get_samples_number();

    // One flag per sample; > 0.5 means the sample violated some range.

    Tensor<type, 1> filtered_indices(samples_number);
    filtered_indices.setZero();

    const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
    const Index used_samples_number = used_samples_indices.size();

    Index sample_index = 0;

    for(Index i = 0; i < used_variables_number; i++)
    {
        const Index variable_index = used_variables_indices(i);

        for(Index j = 0; j < used_samples_number; j++)
        {
            sample_index = used_samples_indices(j);

            // Skip samples already discarded (possibly by an earlier variable in this loop).

            if(get_sample_use(sample_index) == SampleUse::UnusedSample) continue;

            // Missing values are never treated as out of range.

            if(isnan(data(sample_index, variable_index))) continue;

            // Values sitting exactly on a bound (within numeric tolerance) are kept.

            if(abs(data(sample_index, variable_index) - minimums(i)) <= type(NUMERIC_LIMITS_MIN)
                    || abs(data(sample_index, variable_index) - maximums(i)) <= type(NUMERIC_LIMITS_MIN)) continue;

            if(data(sample_index,variable_index) < minimums(i)
                    || data(sample_index,variable_index) > maximums(i))
            {
                filtered_indices(sample_index) = type(1);

                set_sample_use(sample_index, SampleUse::UnusedSample);
            }
        }
    }

    // Collect the indices of all flagged samples.

    const Index filtered_samples_number =
            static_cast<Index>(count_if(filtered_indices.data(),
                                        filtered_indices.data()+filtered_indices.size(), [](type value)
                                        {return value > static_cast<type>(0.5);}));

    Tensor<Index, 1> filtered_samples_indices(filtered_samples_number);

    Index index = 0;

    for(Index i = 0; i < samples_number; i++)
    {
        if(filtered_indices(i) > static_cast<type>(0.5))
        {
            filtered_samples_indices(index) = i;
            index++;
        }
    }

    return filtered_samples_indices;
}
9749
9750
9752
9754{
9755 const Index samples_number = get_samples_number();
9756
9757 #pragma omp parallel for
9758
9759 for(Index i = 0; i <samples_number; i++)
9760 {
9761 if(has_nan_row(i)) set_sample_use(i, "Unused");
9762 }
9763}
9764
9765
9767
9769{
9770 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9771 const Tensor<Index, 1> used_variables_indices = get_used_variables_indices();
9772
9773 const Tensor<type, 1> means = mean(data, used_samples_indices, used_variables_indices);
9774
9775 const Index samples_number = used_samples_indices.size();
9776 const Index variables_number = used_variables_indices.size();
9777
9778 Index current_variable;
9779 Index current_sample;
9780
9781#pragma omp parallel for schedule(dynamic)
9782
9783 for(Index j = 0; j < variables_number; j++)
9784 {
9785 current_variable = used_variables_indices(j);
9786
9787 for(Index i = 0; i < samples_number; i++)
9788 {
9789 current_sample = used_samples_indices(i);
9790
9791 if(isnan(data(current_sample, current_variable)))
9792 {
9793 data(current_sample,current_variable) = means(j);
9794 }
9795 }
9796 }
9797}
9798
9799
9801
9803{
9804 const Tensor<Index, 1> used_samples_indices = get_used_samples_indices();
9805 const Tensor<Index, 1> used_variables_indices = get_used_columns_indices();
9806
9807 const Tensor<type, 1> medians = median(data, used_samples_indices, used_variables_indices);
9808
9809 const Index variables_number = used_variables_indices.size();
9810 const Index samples_number = used_samples_indices.size();
9811
9812#pragma omp parallel for schedule(dynamic)
9813
9814 for(Index j = 0; j < variables_number; j++)
9815 {
9816 for(Index i = 0 ; i < samples_number ; i++)
9817 {
9818 if(isnan(data(used_samples_indices(i),used_variables_indices(j)))) data(used_samples_indices(i),used_variables_indices(j)) = medians(j);
9819 }
9820 }
9821}
9822
9823
9827
9829{
9830 switch(missing_values_method)
9831 {
9832 case MissingValuesMethod::Unuse:
9833
9835
9836 break;
9837
9838 case MissingValuesMethod::Mean:
9839
9841
9842 break;
9843
9844 case MissingValuesMethod::Median:
9845
9847
9848 break;
9849 }
9850}
9851
9852
9853void DataSet::read_csv()
9854{
9855 read_csv_1();
9856
9857 if(!has_time_columns() && !has_categorical_columns())
9858 {
9859 read_csv_2_simple();
9860
9861 read_csv_3_simple();
9862 }
9863 else
9864 {
9865 read_csv_2_complete();
9866
9867 read_csv_3_complete();
9868 }
9869
9870}
9871
9872
9873Tensor<string, 1> DataSet::get_default_columns_names(const Index& columns_number)
9874{
9875 Tensor<string, 1> columns_names(columns_number);
9876
9877 for(Index i = 0; i < columns_number; i++)
9878 {
9879 ostringstream buffer;
9880
9881 buffer << "column_" << i+1;
9882
9883 columns_names(i) = buffer.str();
9884 }
9885
9886 return columns_names;
9887}
9888
9889
/// First pass of the CSV loader: reads a short preview of the file, infers the
/// number of columns, sets column names (from the header or defaults) and
/// infers each column's type (DateTime, Numeric or Categorical).
/// @throws logic_error If the file name is empty, the file cannot be opened,
/// the file is empty, or the header contains numeric values.

void DataSet::read_csv_1()
{
    if(data_file_name.empty())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_csv() method.\n"
               << "Data file name is empty.\n";

        throw logic_error(buffer.str());
    }

    ifstream file(data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_csv() method.\n"
               << "Cannot open data file: " << data_file_name << "\n";

        throw logic_error(buffer.str());
    }

    const char separator_char = get_separator_char();

    if(display) cout << "Setting data file preview..." << endl;

    // Preview holds the first 3 data lines (4 when the file has a header row).

    const Index lines_number = has_columns_names ? 4 : 3;

    data_file_preview.resize(lines_number);

    string line;

    Index lines_count = 0;

    while(file.good())
    {
        getline(file, line);

        trim(line);

        erase(line, '"');

        if(line.empty()) continue;

        check_separators(line);

        check_special_characters(line);

        data_file_preview(lines_count) = get_tokens(line, separator_char);

        lines_count++;

        if(lines_count == lines_number) break;
    }

    file.close();

    // Check empty file @todo, size() methods returns 0

    if(data_file_preview(0).size() == 0)
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_csv_1() method.\n"
               << "File " << data_file_name << " is empty.\n";

        throw logic_error(buffer.str());
    }

    // Set rows labels and columns names

    if(display) cout << "Setting rows labels..." << endl;

    // NOTE(review): first_name is computed and lower-cased but never read
    // afterwards in this view — possibly dead code; verify before removing.

    string first_name = data_file_preview(0)(0);
    transform(first_name.begin(), first_name.end(), first_name.begin(), ::tolower);

    // When the first column holds row labels it does not count as a variable.

    const Index columns_number = has_rows_labels ? data_file_preview(0).size()-1 : data_file_preview(0).size();

    columns.resize(columns_number);

    // Check if header has numeric value

    if(has_columns_names && has_numbers(data_file_preview(0)))
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_csv_1() method.\n"
               << "Some columns names are numeric.\n";

        throw logic_error(buffer.str());
    }

    // Columns names

    if(display) cout << "Setting columns names..." << endl;

    // NOTE(review): this brace block appears to have lost its guard condition
    // (presumably a header-presence check) in this view — verify upstream.

    {
        has_rows_labels ? set_columns_names(data_file_preview(0).slice(Eigen::array<Eigen::Index, 1>({1}),
                                                                       Eigen::array<Eigen::Index, 1>({data_file_preview(0).size()-1})))
                        : set_columns_names(data_file_preview(0));
    }
    else
    {
        set_columns_names(get_default_columns_names(columns_number));
    }

    // Columns types

    if(display) cout << "Setting columns types..." << endl;

    Index column_index = 0;

    for(Index i = 0; i < data_file_preview(0).dimension(0); i++)
    {
        if(has_rows_labels && i == 0) continue;

        // A column is DateTime when any previewed row parses as a date string.

        if((is_date_time_string(data_file_preview(1)(i)) && data_file_preview(1)(i) != missing_values_label)
                || (is_date_time_string(data_file_preview(2)(i)) && data_file_preview(2)(i) != missing_values_label)
                || (is_date_time_string(data_file_preview(lines_number-2)(i)) && data_file_preview(lines_number-2)(i) != missing_values_label)
                || (is_date_time_string(data_file_preview(lines_number-1)(i)) && data_file_preview(lines_number-1)(i) != missing_values_label))
/*
                || (data_file_preview(0)(i).find("time") != string::npos && is_numeric_string(data_file_preview(1)(i)) && is_numeric_string(data_file_preview(2)(i))
                    && is_numeric_string(data_file_preview(lines_number-2)(i))
                    && is_numeric_string(data_file_preview(lines_number-2)(i)) ))
*/
        {


            columns(column_index).type = ColumnType::DateTime;
            column_index++;
        }
        // Numeric when any previewed row is numeric or empty.

        else if(((is_numeric_string(data_file_preview(1)(i)) && data_file_preview(1)(i) != missing_values_label) || data_file_preview(1)(i).empty())
                || ((is_numeric_string(data_file_preview(2)(i)) && data_file_preview(2)(i) != missing_values_label) || data_file_preview(1)(i).empty())
                || ((is_numeric_string(data_file_preview(lines_number-2)(i)) && data_file_preview(lines_number-2)(i) != missing_values_label) || data_file_preview(1)(i).empty())
                || ((is_numeric_string(data_file_preview(lines_number-1)(i)) && data_file_preview(lines_number-1)(i) != missing_values_label) || data_file_preview(1)(i).empty()))
        {
            columns(column_index).type = ColumnType::Numeric;
            column_index++;
        }
        else
        {
            columns(column_index).type = ColumnType::Categorical;
            column_index++;
        }
    }

    // A user-designated time column always overrides the inferred type.

    if(time_column != "")
    {
        set_column_type(time_column, DataSet::ColumnType::DateTime);
    }
}
10048
10049
/// Second pass of the simple CSV loader (all columns numeric): counts the data
/// rows, validates the token count of every line, then sizes the data matrix
/// and marks every sample as Training.
/// @throws logic_error If the file cannot be opened or a line has the wrong
/// number of tokens.

void DataSet::read_csv_2_simple()
{
    ifstream file(data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_csv_2_simple() method.\n"
               << "Cannot open data file: " << data_file_name << "\n";

        throw logic_error(buffer.str());
    }

    string line;
    Index line_number = 0;

    // NOTE(review): this brace block appears to have lost its guard condition
    // (presumably if(has_columns_names), to skip the header) — verify upstream.

    {
        while(file.good())
        {
            line_number++;

            getline(file, line);

            trim(line);

            erase(line, '"');

            if(line.empty()) continue;

            break;
        }
    }

    Index samples_count = 0;

    Index tokens_count;

    if(display) cout << "Setting data dimensions..." << endl;

    const char separator_char = get_separator_char();

    const Index columns_number = get_columns_number();

    // Raw column count includes the row-labels column when present.

    const Index raw_columns_number = has_rows_labels ? columns_number + 1 : columns_number;

    while(file.good())
    {
        line_number++;

        getline(file, line);

        trim(line);

        //erase(line, '"');

        if(line.empty()) continue;

        tokens_count = count_tokens(line, separator_char);

        if(tokens_count != raw_columns_number)
        {
            ostringstream buffer;

            buffer << "OpenNN Exception: DataSet class.\n"
                   << "void read_csv_2_simple() method.\n"
                   << "Line " << line_number << ": Size of tokens("
                   << tokens_count << ") is not equal to number of columns("
                   << raw_columns_number << ").\n";

            throw logic_error(buffer.str());
        }

        samples_count++;
    }

    file.close();

    data.resize(samples_count, columns_number);

    samples_uses.resize(samples_count);
    samples_uses.setConstant(SampleUse::Training);

}
10138
10139
/// Third pass of the simple CSV loader: parses every line into the already
/// sized data matrix, storing NAN for missing values, filling rows_labels when
/// present, then checks for constant and binary columns.
/// @throws logic_error If the file cannot be opened.

void DataSet::read_csv_3_simple()
{
    ifstream file(data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        // NOTE(review): the message names read_csv_2_simple but this is
        // read_csv_3_simple — likely a copy-paste slip in the message text.

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_csv_2_simple() method.\n"
               << "Cannot open data file: " << data_file_name << "\n";

        throw logic_error(buffer.str());
    }

    // Pick the fastest stdlib conversion for the configured numeric type.

    const bool is_float = is_same<type, float>::value;

    const char separator_char = get_separator_char();

    string line;

    // Read header

    // NOTE(review): this brace block appears to have lost its guard condition
    // (presumably if(has_columns_names)) — verify upstream.

    {
        while(file.good())
        {
            getline(file, line);

            if(line.empty()) continue;

            break;
        }
    }

    // Read data

    Index j = 0;

    const Index raw_columns_number = has_rows_labels ? get_columns_number() + 1 : get_columns_number();

    Tensor<string, 1> tokens(raw_columns_number);

    const Index samples_number = data.dimension(0);

    if(has_rows_labels) rows_labels.resize(samples_number);

    if(display) cout << "Reading data..." << endl;

    Index sample_index = 0;
    Index column_index = 0;

    while(file.good())
    {
        getline(file, line);

        trim(line);

        erase(line, '"');

        if(line.empty()) continue;

        fill_tokens(line, separator_char, tokens);

        for(j = 0; j < raw_columns_number; j++)
        {
            trim(tokens(j));

            if(has_rows_labels && j == 0)
            {
                rows_labels(sample_index) = tokens(j);
            }
            else if(tokens(j) == missing_values_label || tokens(j).empty())
            {
                data(sample_index, column_index) = static_cast<type>(NAN);
                column_index++;
            }
            else if(is_float)
            {
                data(sample_index, column_index) = type(strtof(tokens(j).data(), NULL));
                column_index++;
            }
            else
            {
                data(sample_index, column_index) = type(stof(tokens(j)));
                column_index++;
            }
        }

        column_index = 0;
        sample_index++;
    }

    // Keep the last parsed line as the final preview row.

    const Index data_file_preview_index = has_columns_names ? 3 : 2;

    data_file_preview(data_file_preview_index) = tokens;

    file.close();

    if(display) cout << "Data read succesfully..." << endl;

    // Check Constant

    check_constant_columns();

    // Check Binary

    if(display) cout << "Checking binary columns..." << endl;

    set_binary_simple_columns();
}
10251
10252
/// Second pass of the complete CSV loader (time/categorical columns present):
/// counts rows, validates token counts, collects the category set of every
/// categorical column (demoting 2-category columns to Binary), then sizes the
/// data matrix and marks every sample as Training.
/// @throws logic_error If the file cannot be opened or a line has the wrong
/// number of tokens.

void DataSet::read_csv_2_complete()
{
    ifstream file(data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_csv_2_complete() method.\n"
               << "Cannot open data file: " << data_file_name << "\n";

        throw logic_error(buffer.str());
    }

    const char separator_char = get_separator_char();

    string line;

    Tensor<string, 1> tokens;

    Index lines_count = 0;
    Index tokens_count;

    const Index columns_number = columns.size();

    // Non-categorical columns default to Input use.
    // NOTE(review): unsigned j vs signed Index columns_number mixes
    // signedness — harmless here but worth normalizing upstream.

    for(unsigned j = 0; j < columns_number; j++)
    {
        if(columns(j).type != ColumnType::Categorical)
        {
            columns(j).column_use = VariableUse::Input;
        }
    }

    // Skip header

    // NOTE(review): this brace block appears to have lost its guard condition
    // (presumably if(has_columns_names)) — verify upstream.

    {
        while(file.good())
        {
            getline(file, line);

            trim(line);

            if(line.empty()) continue;

            break;
        }
    }

    // Read data

    if(display) cout << "Setting data dimensions..." << endl;

    const Index raw_columns_number = has_rows_labels ? columns_number + 1 : columns_number;

    Index column_index = 0;

    while(file.good())
    {
        getline(file, line);

        trim(line);

        if(line.empty()) continue;

        tokens = get_tokens(line, separator_char);

        tokens_count = tokens.size();

        if(static_cast<unsigned>(tokens_count) != raw_columns_number)
        {
            const string message =
                "Sample " + to_string(lines_count+1) + " error:\n"
                "Size of tokens (" + to_string(tokens_count) + ") is not equal to number of columns (" + to_string(raw_columns_number) + ").\n"
                "Please check the format of the data file (e.g: Use of commas both as decimal and column separator)";

            throw logic_error(message);
        }

        // Register every new category seen in each categorical column.

        for(unsigned j = 0; j < raw_columns_number; j++)
        {
            if(has_rows_labels && j == 0) continue;

            trim(tokens(j));

            if(columns(column_index).type == ColumnType::Categorical)
            {
                if(find(columns(column_index).categories.data(), columns(column_index).categories.data() + columns(column_index).categories.size(), tokens(j)) == (columns(column_index).categories.data() + columns(column_index).categories.size()))
                {
                    // Missing-value labels are not categories.

                    if(tokens(j) == missing_values_label)
                    {
                        column_index++;
                        continue;
                    }

                    columns(column_index).add_category(tokens(j));
                }
            }

            column_index++;
        }

        column_index = 0;

        lines_count++;
    }

    if(display) cout << "Setting types..." << endl;

    // Exactly two categories means the column is really binary.

    for(Index j = 0; j < columns_number; j++)
    {
        if(columns(j).type == ColumnType::Categorical)
        {
            if(columns(j).categories.size() == 2)
            {
                columns(j).type = ColumnType::Binary;
            }
        }
    }

    file.close();

    const Index samples_number = static_cast<unsigned>(lines_count);

    const Index variables_number = get_variables_number();

    data.resize(static_cast<Index>(samples_number), variables_number);
    data.setZero();

    if(has_rows_labels) rows_labels.resize(samples_number);

    samples_uses.resize(static_cast<Index>(samples_number));

    samples_uses.setConstant(SampleUse::Training);

}
10393
10394
/// Third pass of the complete CSV loader: parses every line into the data
/// matrix, expanding categorical columns to one-hot variables, converting
/// DateTime strings to timestamps, and storing NAN for missing values; then
/// checks for constant and binary columns.
/// @throws logic_error If the file cannot be opened or a numeric token cannot
/// be parsed.

void DataSet::read_csv_3_complete()
{
    ifstream file(data_file_name.c_str());

    if(!file.is_open())
    {
        ostringstream buffer;

        buffer << "OpenNN Exception: DataSet class.\n"
               << "void read_csv_3_complete() method.\n"
               << "Cannot open data file: " << data_file_name << "\n";

        throw logic_error(buffer.str());
    }

    const char separator_char = get_separator_char();

    const Index columns_number = columns.size();

    const Index raw_columns_number = has_rows_labels ? columns_number+1 : columns_number;

    string line;

    Tensor<string, 1> tokens;

    string token;

    // variable_index walks the expanded (one-hot) variables; column_index
    // walks the raw file columns.

    unsigned sample_index = 0;
    unsigned variable_index = 0;
    unsigned column_index = 0;

    // Skip header

    // NOTE(review): this brace block appears to have lost its guard condition
    // (presumably if(has_columns_names)) — verify upstream.

    {
        while(file.good())
        {
            getline(file, line);

            trim(line);

            if(line.empty()) continue;

            break;
        }
    }

    // Read data

    if(display) cout << "Reading data..." << endl;

    while(file.good())
    {
        getline(file, line);

        trim(line);

        erase(line, '"');

        if(line.empty()) continue;

        tokens = get_tokens(line, separator_char);

        variable_index = 0;
        column_index = 0;

        for(Index j = 0; j < raw_columns_number; j++)
        {
            trim(tokens(j));

            if(has_rows_labels && j ==0)
            {
                rows_labels(sample_index) = tokens(j);
                continue;
            }
            else if(columns(column_index).type == ColumnType::Numeric)
            {
                if(tokens(j) == missing_values_label || tokens(j).empty())
                {
                    data(sample_index, variable_index) = static_cast<type>(NAN);
                    variable_index++;
                }
                else
                {
                    try
                    {
                        data(sample_index, variable_index) = static_cast<type>(stod(tokens(j)));
                        variable_index++;
                    }
                    // NOTE(review): catches by value; catch(const invalid_argument&)
                    // would be the conventional form.

                    catch(invalid_argument)
                    {
                        ostringstream buffer;

                        buffer << "OpenNN Exception: DataSet class.\n"
                               << "void read_csv_3_complete() method.\n"
                               << "Sample " << sample_index << "; Invalid number: " << tokens(j) << "\n";

                        throw logic_error(buffer.str());
                    }
                }
            }
            else if(columns(column_index).type == ColumnType::DateTime)
            {
                if(tokens(j) == missing_values_label || tokens(j).empty())
                {
                    data(sample_index, variable_index) = static_cast<type>(NAN);
                    variable_index++;
                }
                else
                {
                    data(sample_index, variable_index) = static_cast<type>(date_to_timestamp(tokens(j), gmt));

                    variable_index++;
                }
            }
            else if(columns(column_index).type == ColumnType::Categorical)
            {
                // One-hot: one variable per category; matching category gets 1,
                // missing values set NAN across all category variables.

                for(Index k = 0; k < columns(column_index).get_categories_number(); k++)
                {
                    if(tokens(j) == missing_values_label)
                    {
                        data(sample_index, variable_index) = static_cast<type>(NAN);
                    }
                    else if(tokens(j) == columns(column_index).categories(k))
                    {
                        data(sample_index, variable_index) = type(1);
                    }

                    variable_index++;
                }
            }
            else if(columns(column_index).type == ColumnType::Binary)
            {
                if(tokens(j) == missing_values_label)
                {
                    data(sample_index, variable_index) = static_cast<type>(NAN);
                }
                else if(columns(column_index).categories.size() > 0 && tokens(j) == columns(column_index).categories(0))
                {
                    data(sample_index, variable_index) = type(1);
                }
                else if(tokens(j) == columns(column_index).name)
                {
                    data(sample_index, variable_index) = type(1);
                }

                variable_index++;
            }

            column_index++;
        }

        sample_index++;
    }

    // Keep the last parsed line as the final preview row.

    const Index data_file_preview_index = has_columns_names ? 3 : 2;

    data_file_preview(data_file_preview_index) = tokens;

    if(display) cout << "Data read succesfully..." << endl;

    file.close();

    // Check Constant and DateTime to unused

    check_constant_columns();

    // Check binary

    if(display) cout << "Checking binary columns..." << endl;

    set_binary_simple_columns();

}
10569
10570
10571void DataSet::check_separators(const string& line) const
10572{
10573 if(line.find(',') == string::npos
10574 && line.find(';') == string::npos
10575 && line.find(' ') == string::npos
10576 && line.find('\t') == string::npos) return;
10577
10578 const char separator_char = get_separator_char();
10579
10580 if(line.find(separator_char) == string::npos)
10581 {
10582 const string message =
10583 "Error: " + get_separator_string() + " separator not found in line data file " + data_file_name + ".\n"
10584 "Line: '" + line + "'";
10585
10586 throw logic_error(message);
10587 }
10588
10589 if(separator == Separator::Space)
10590 {
10591 if(line.find(',') != string::npos)
10592 {
10593 const string message =
10594 "Error: Found comma (',') in data file " + data_file_name + ", but separator is space (' ').";
10595
10596 throw logic_error(message);
10597 }
10598 if(line.find(';') != string::npos)
10599 {
10600 const string message =
10601 "Error: Found semicolon (';') in data file " + data_file_name + ", but separator is space (' ').";
10602
10603 throw logic_error(message);
10604 }
10605 }
10606 else if(separator == Separator::Tab)
10607 {
10608 if(line.find(',') != string::npos)
10609 {
10610 const string message =
10611 "Error: Found comma (',') in data file " + data_file_name + ", but separator is tab (' ').";
10612
10613 throw logic_error(message);
10614 }
10615 if(line.find(';') != string::npos)
10616 {
10617 const string message =
10618 "Error: Found semicolon (';') in data file " + data_file_name + ", but separator is tab (' ').";
10619
10620 throw logic_error(message);
10621 }
10622 }
10623 else if(separator == Separator::Comma)
10624 {
10625 if(line.find(";") != string::npos)
10626 {
10627 const string message =
10628 "Error: Found semicolon (';') in data file " + data_file_name + ", but separator is comma (',').";
10629
10630 throw logic_error(message);
10631 }
10632 }
10633 else if(separator == Separator::Semicolon)
10634 {
10635 if(line.find(",") != string::npos)
10636 {
10637 const string message =
10638 "Error: Found comma (',') in data file " + data_file_name + ", but separator is semicolon (';'). " + line;
10639
10640 throw logic_error(message);
10641 }
10642 }
10643}
10644
10645
10646void DataSet::check_special_characters(const string & line) const
10647{
10648 if(line.find_first_of("|@#~€¬^*") != string::npos)
10649 {
10650 const string message =
10651 "Error: found special characters in line: " + line + ". Please, review the file.";
10652
10653 throw logic_error(message);
10654 }
10655
10656//#ifdef __unix__
10657// if(line.find("\r") != string::npos)
10658// {
10659// const string message =
10660// "Error: mixed break line characters in line: " + line + ". Please, review the file.";
10661// throw logic_error(message);
10662// }
10663//#endif
10664
10665}
10666
10667
10668bool DataSet::has_binary_columns() const
10669{
10670 const Index variables_number = columns.size();
10671
10672 for(Index i = 0; i < variables_number; i++)
10673 {
10674 if(columns(i).type == ColumnType::Binary) return true;
10675 }
10676
10677 return false;
10678}
10679
10680
10681bool DataSet::has_categorical_columns() const
10682{
10683 const Index variables_number = columns.size();
10684
10685 for(Index i = 0; i < variables_number; i++)
10686 {
10687 if(columns(i).type == ColumnType::Categorical) return true;
10688 }
10689
10690 return false;
10691}
10692
10693
10694bool DataSet::has_time_columns() const
10695{
10696 const Index columns_number = columns.size();
10697
10698 for(Index i = 0; i < columns_number; i++)
10699 {
10700 if(columns(i).type == ColumnType::DateTime) return true;
10701 }
10702
10703 return false;
10704}
10705
10706
10707bool DataSet::has_time_time_series_columns() const
10708{
10709 const Index time_series_columns_number = time_series_columns.size();
10710
10711 for(Index i = 0; i < time_series_columns_number; i++)
10712 {
10713 if(time_series_columns(i).type == ColumnType::DateTime) return true;
10714 }
10715
10716 return false;
10717}
10718
10719
10720
10721bool DataSet::has_selection() const
10722{
10723 if(get_selection_samples_number() == 0) return false;
10724
10725 return true;
10726}
10727
10728
10729Tensor<Index, 1> DataSet::count_nan_columns() const
10730{
10731 const Index columns_number = get_columns_number();
10732 const Index rows_number = get_samples_number();
10733
10734 Tensor<Index, 1> nan_columns(get_columns_number());
10735 nan_columns.setZero();
10736
10737 for(Index column_index = 0; column_index < columns_number; column_index++)
10738 {
10739 const Index current_variable_index = get_variable_indices(column_index)(0);
10740
10741 for(Index row_index = 0; row_index < rows_number; row_index++)
10742 {
10743 if(isnan(data(row_index,current_variable_index)))
10744 {
10745 nan_columns(column_index)++;
10746 }
10747 }
10748 }
10749
10750 return nan_columns;
10751}
10752
10753
10754Index DataSet::count_rows_with_nan() const
10755{
10756 Index rows_with_nan = 0;
10757
10758 const Index rows_number = data.dimension(0);
10759 const Index columns_number = data.dimension(1);
10760
10761 bool has_nan = true;
10762
10763 for(Index row_index = 0; row_index < rows_number; row_index++)
10764 {
10765 has_nan = false;
10766
10767 for(Index column_index = 0; column_index < columns_number; column_index++)
10768 {
10769 if(isnan(data(row_index, column_index)))
10770 {
10771 has_nan = true;
10772 break;
10773 }
10774 }
10775
10776 if(has_nan) rows_with_nan++;
10777 }
10778
10779 return rows_with_nan;
10780}
10781
10782
10783Index DataSet::count_nan() const
10784{
10785 const Index rows_number = data.dimension(0);
10786 const Index columns_number = data.dimension(1);
10787
10788 Index count = 0;
10789
10790 #pragma omp parallel for reduction(+: count)
10791
10792 for(Index row_index = 0; row_index < rows_number; row_index++)
10793 {
10794 for(Index column_index = 0; column_index < columns_number; column_index++)
10795 {
10796 if(isnan(data(row_index, column_index))) count++;
10797 }
10798 }
10799
10800 return count;
10801}
10802
10803
/// Sets the cached total number of missing values.
/// @param new_missing_values_number Value to store.

void DataSet::set_missing_values_number(const Index& new_missing_values_number)
{
    missing_values_number = new_missing_values_number;
}
10808
10809
/// Recomputes the cached total number of missing values by counting the NaN
/// entries of the data matrix.

void DataSet::set_missing_values_number()
{
    missing_values_number = count_nan();
}
10814
10815
/// Sets the cached per-column missing-value counts.
/// @param new_columns_missing_values_number One count per column.

void DataSet::set_columns_missing_values_number(const Tensor<Index, 1>& new_columns_missing_values_number)
{
    columns_missing_values_number = new_columns_missing_values_number;
}
10820
10821
/// Recomputes the cached per-column missing-value counts from the data matrix.

void DataSet::set_columns_missing_values_number()
{
    columns_missing_values_number = count_nan_columns();
}
10826
10827
/// Sets the cached number of rows containing missing values.
/// @param new_rows_missing_values_number Value to store.

void DataSet::set_rows_missing_values_number(const Index& new_rows_missing_values_number)
{
    rows_missing_values_number = new_rows_missing_values_number;
}
10832
10833
/// Recomputes the cached number of rows containing at least one missing value.

void DataSet::set_rows_missing_values_number()
{
    rows_missing_values_number = count_rows_with_nan();
}
10838
10839
/// Makes column names and variable names unique: duplicated column names get
/// a numeric suffix (_1, _2, ...), and duplicated variable names belonging to
/// categorical columns get their column name appended.

void DataSet::fix_repeated_names()
{
    // Fix columns names

    const Index columns_number = columns.size();

    // Count how many times each column name occurs.

    map<string, Index> columns_count_map;

    for(Index i = 0; i < columns_number; i++)
    {
        auto result = columns_count_map.insert(pair<string, Index>(columns(i).name, 1));

        if(!result.second) result.first->second++;
    }

    // Rename every occurrence of a duplicated name with an increasing suffix.

    for(auto & element : columns_count_map)
    {
        if(element.second > 1)
        {
            const string repeated_name = element.first;
            Index repeated_index = 1;

            for(Index i = 0; i < columns.size(); i++)
            {
                if(columns(i).name == repeated_name)
                {
                    columns(i).name = columns(i).name + "_" + to_string(repeated_index);
                    repeated_index++;
                }
            }
        }
    }

    // Fix variables names

    // Variable-level duplicates can only arise from categorical/binary
    // expansion, so the pass runs only in that case.

    if(has_categorical_columns() || has_binary_columns())
    {
        Tensor<string, 1> variables_names = get_variables_names();

        const Index variables_number = variables_names.size();

        map<string, Index> variables_count_map;

        for(Index i = 0; i < variables_number; i++)
        {
            auto result = variables_count_map.insert(pair<string, Index>(variables_names(i), 1));

            if(!result.second) result.first->second++;
        }

        for(auto & element : variables_count_map)
        {
            if(element.second > 1)
            {
                const string repeated_name = element.first;

                for(Index i = 0; i < variables_number; i++)
                {
                    if(variables_names(i) == repeated_name)
                    {
                        const Index column_index = get_column_index(i);

                        // Only variables of categorical columns are disambiguated.

                        if(columns(column_index).type != ColumnType::Categorical) continue;

                        variables_names(i) = variables_names(i) + "_" + columns(column_index).name;
                    }
                }
            }
        }

        set_variables_names(variables_names);
    }
}
10913
10914
10915Tensor<Index, 1> DataSet::push_back(const Tensor<Index, 1>& old_vector, const Index& new_string) const
10916{
10917 const Index old_size = old_vector.size();
10918
10919 const Index new_size = old_size+1;
10920
10921 Tensor<Index, 1> new_vector(new_size);
10922
10923 for(Index i = 0; i < old_size; i++) new_vector(i) = old_vector(i);
10924
10925 new_vector(new_size-1) = new_string;
10926
10927 return new_vector;
10928}
10929
10930
10931Tensor<string, 1> DataSet::push_back(const Tensor<string, 1>& old_vector, const string& new_string) const
10932{
10933 const Index old_size = old_vector.size();
10934
10935 const Index new_size = old_size+1;
10936
10937 Tensor<string, 1> new_vector(new_size);
10938
10939 for(Index i = 0; i < old_size; i++) new_vector(i) = old_vector(i);
10940
10941 new_vector(new_size-1) = new_string;
10942
10943 return new_vector;
10944}
10945
10946
10947void DataSet::initialize_sequential(Tensor<Index, 1>& new_tensor,
10948 const Index& start, const Index& step, const Index& end) const
10949{
10950 const Index new_size = (end-start)/step+1;
10951
10952 new_tensor.resize(new_size);
10953 new_tensor(0) = start;
10954
10955 for(Index i = 1; i < new_size-1; i++)
10956 {
10957 new_tensor(i) = new_tensor(i-1)+step;
10958 }
10959
10960 new_tensor(new_size-1) = end;
10961}
10962
10963
10964void DataSet::intialize_sequential(Tensor<type, 1>& new_tensor,
10965 const type& start, const type& step, const type& end) const
10966{
10967 const Index new_size = Index((end-start)/type(step) + type(1));
10968
10969 new_tensor.resize(new_size);
10970 new_tensor(0) = start;
10971
10972 for(Index i = 1; i < new_size-1; i++)
10973 {
10974 new_tensor(i) = new_tensor(i-1)+step;
10975 }
10976
10977 new_tensor(new_size-1) = end;
10978}
10979
10980
10981Tensor<Index, 2> DataSet::split_samples(const Tensor<Index, 1>& samples_indices, const Index& new_batch_size) const
10982{
10983 const Index samples_number = samples_indices.dimension(0);
10984
10985 Index batches_number;
10986 Index batch_size = new_batch_size;
10987
10988 if(samples_number < batch_size)
10989 {
10990 batches_number = 1;
10991 batch_size = samples_number;
10992 }
10993 else
10994 {
10995 batches_number = samples_number / batch_size;
10996 }
10997
10998 Tensor<Index, 2> batches(batches_number, batch_size);
10999
11000 Index count = 0;
11001
11002 for(Index i = 0; i < batches_number; ++i)
11003 {
11004 for(Index j = 0; j < batch_size; ++j)
11005 {
11006 batches(i,j) = samples_indices(count);
11007
11008 count++;
11009 }
11010 }
11011
11012 return batches;
11013}
11014
11015
/// Fills the batch input and target tensors from the data set's data matrix.
/// @param samples Row indices of the samples belonging to this batch.
/// @param inputs Column indices of the input variables.
/// @param targets Column indices of the target variables.

void DataSetBatch::fill(const Tensor<Index, 1>& samples,
                        const Tensor<Index, 1>& inputs,
                        const Tensor<Index, 1>& targets)
{
    const Tensor<type, 2>& data = data_set_pointer->get_data();

    const Tensor<Index, 1>& input_variables_dimensions = data_set_pointer->get_input_variables_dimensions();

    if(input_variables_dimensions.size() == 1)
    {
        // Flat inputs: copy the selected rows and input columns into inputs_2d.
        fill_submatrix(data, samples, inputs, inputs_2d.data());
    }
    else if(input_variables_dimensions.size() == 4)
    {
        // A 4-element dimensions vector is read here as (samples, channels, rows, columns).
        // NOTE(review): DataSetBatch::set() instead tests rank()==3 and reads
        // (channels, rows, columns) from positions 0-2 — the two layouts
        // disagree; confirm which is intended.

        const Index samples_number = input_variables_dimensions(0);
        const Index channels_number = input_variables_dimensions(1);
        const Index rows_number = input_variables_dimensions(2);
        const Index columns_number = input_variables_dimensions(3);

        inputs_4d.resize(samples_number, channels_number, rows_number, columns_number);

        Index index = 0;

        for(Index image = 0; image < samples_number; image++)
        {
            // Restart the flat column index for every image.
            index = 0;

            for(Index channel = 0; channel < channels_number; channel++)
            {
                for(Index row = 0; row < rows_number; row++)
                {
                    for(Index column = 0; column < columns_number; column++)
                    {
                        // NOTE(review): reads data(image, index), i.e. row `image`
                        // of the full data matrix instead of samples(image) —
                        // the batch sample indices appear to be ignored here;
                        // verify against callers.
                        inputs_4d(image, channel, row, column) = data(image, index);
                        index++;
                    }
                }
            }
        }
    }

    // Targets are always copied as a flat 2d submatrix.
    fill_submatrix(data, samples, targets, targets_2d.data());
}
11059
11060
11061DataSetBatch::DataSetBatch(const Index& new_samples_number, DataSet* new_data_set_pointer)
11062{
11063 set(new_samples_number, new_data_set_pointer);
11064}
11065
11066
11067void DataSetBatch::set(const Index& new_samples_number, DataSet* new_data_set_pointer)
11068{
11069 samples_number = new_samples_number;
11070
11071 data_set_pointer = new_data_set_pointer;
11072
11073 const Index input_variables_number = data_set_pointer->get_input_variables_number();
11074 const Index target_variables_number = data_set_pointer->get_target_variables_number();
11075
11076 const Tensor<Index, 1> input_variables_dimensions = data_set_pointer->get_input_variables_dimensions();
11077
11078 if(input_variables_dimensions.rank() == 1)
11079 {
11080 inputs_2d.resize(samples_number, input_variables_number);
11081 }
11082 else if(input_variables_dimensions.rank() == 3)
11083 {
11084 const Index channels_number = input_variables_dimensions(0);
11085 const Index rows_number = input_variables_dimensions(1);
11086 const Index columns_number = input_variables_dimensions(2);
11087
11088 inputs_4d.resize(samples_number, channels_number, rows_number, columns_number);
11089 }
11090
11091 targets_2d.resize(samples_number, target_variables_number);
11092}
11093
11094
11095Index DataSetBatch::get_samples_number() const
11096{
11097 return samples_number;
11098}
11099
11100
11101void DataSetBatch::print() const
11102{
11103 cout << "Batch structure" << endl;
11104
11105 cout << "Inputs:" << endl;
11106 cout << inputs_2d << endl;
11107
11108 cout << "Targets:" << endl;
11109 cout << targets_2d << endl;
11110}
11111
11112
11113void DataSet::shuffle()
11114{
11115 random_device rng;
11116 mt19937 urng(rng());
11117
11118 const Index data_rows = data.dimension(0);
11119 const Index data_columns = data.dimension(1);
11120
11121 Tensor<Index, 1> indices(data_rows);
11122
11123 for(Index i = 0; i < data_rows; i++) indices(i) = i;
11124
11125 std::shuffle(&indices(0), &indices(data_rows-1), urng);
11126
11127 Tensor<type, 2> new_data(data_rows, data_columns);
11128 Tensor<string, 1> new_rows_labels(data_rows);
11129
11130 Index index = 0;
11131
11132 for(Index i = 0; i < data_rows; i++)
11133 {
11134 index = indices(i);
11135
11136 new_rows_labels(i) = rows_labels(index);
11137
11138 for(Index j = 0; j < data_columns; j++)
11139 {
11140 new_data(i,j) = data(index,j);
11141 }
11142 }
11143
11144 data = new_data;
11145 rows_labels = new_rows_labels;
11146}
11147
11148
11149bool DataSet::get_has_rows_labels() const
11150{
11151 return this->has_rows_labels;
11152}
11153
11154}
11155
11156
11157// OpenNN: Open Neural Networks Library.
11158// Copyright(C) 2005-2021 Artificial Intelligence Techniques, SL.
11159//
11160// This library is free software; you can redistribute it and/or
11161// modify it under the terms of the GNU Lesser General Public
11162// License as published by the Free Software Foundation; either
11163// version 2.1 of the License, or any later version.
11164//
11165// This library is distributed in the hope that it will be useful,
11166// but WITHOUT ANY WARRANTY; without even the implied warranty of
11167// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11168// Lesser General Public License for more details.
11169
11170// You should have received a copy of the GNU Lesser General Public
11171// License along with this library; if not, write to the Free Software
11172// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
This class represents the concept of data set for data modelling problems, such as approximation,...
Definition: data_set.h:57
Index calculate_testing_negatives(const Index &) const
Definition: data_set.cpp:5468
bool is_sample_used(const Index &) const
Definition: data_set.cpp:910
Tensor< type, 1 > calculate_input_variables_maximums() const
Returns a vector containing the maximums of the input variables.
Definition: data_set.cpp:5784
void transform_time_series_columns()
This method transforms the columns into time series for forecasting problems.
Definition: data_set.cpp:790
Tensor< type, 2 > get_target_data() const
Definition: data_set.cpp:3972
Index get_training_samples_number() const
Returns the number of samples in the data set which will be used for training.
Definition: data_set.cpp:1382
Tensor< type, 1 > calculate_used_variables_minimums() const
Returns a vector containing the minimums of the used variables.
Definition: data_set.cpp:5800
Tensor< type, 2 > calculate_autocorrelations(const Index &=10) const
Definition: data_set.cpp:9315
VariableUse get_column_use(const Index &) const
Returns a vector containing the use of the column, without taking into account the categories.
Definition: data_set.cpp:2000
Index get_target_variables_number() const
Returns the number of target variables of the data set.
Definition: data_set.cpp:2884
Tensor< Descriptives, 1 > calculate_columns_descriptives_training_samples() const
Definition: data_set.cpp:5693
void set_sample_use(const Index &, const SampleUse &)
Definition: data_set.cpp:1591
void set_training()
Sets all the samples in the data set for training.
Definition: data_set.cpp:1475
void impute_missing_values_unuse()
Sets all the samples with missing values to "Unused".
Definition: data_set.cpp:9753
Tensor< Descriptives, 1 > calculate_columns_descriptives_positive_samples() const
Calculate the descriptives of the samples with positive targets in binary classification problems.
Definition: data_set.cpp:5525
void save_data_binary(const string &) const
Saves to the data file the values of the data matrix in binary format.
Definition: data_set.cpp:7909
Tensor< string, 1 > get_target_columns_names() const
Returns a string vector which contains the names of the columns whose uses are Target.
Definition: data_set.cpp:2528
Tensor< string, 1 > get_columns_names() const
Returns a string vector that contains the names of the columns.
Definition: data_set.cpp:2473
Tensor< Descriptives, 1 > calculate_input_variables_descriptives() const
Definition: data_set.cpp:5726
void set_variable_name(const Index &, const string &)
Definition: data_set.cpp:3295
Tensor< Descriptives, 1 > scale_target_variables()
Definition: data_set.cpp:6298
Tensor< Index, 1 > calculate_local_outlier_factor_outliers(const Index &=20, const Index &=0, const type &=type(0)) const
Definition: data_set.cpp:9002
void set_input_variables_dimensions(const Tensor< Index, 1 > &)
Sets new input dimensions in the data set.
Definition: data_set.cpp:3650
Tensor< Index, 1 > get_training_samples_indices() const
Returns the indices of the samples which will be used for training.
Definition: data_set.cpp:1073
void set_input()
Sets all the variables in the data set as input variables.
Definition: data_set.cpp:3428
bool has_columns_names
Header which contains variables name.
Definition: data_set.h:784
void generate_constant_data(const Index &, const Index &, const type &)
Definition: data_set.cpp:9558
void generate_Rosenbrock_data(const Index &, const Index &)
Definition: data_set.cpp:9607
void set_data_random()
Definition: data_set.cpp:6449
Tensor< string, 1 > get_time_series_variables_names() const
Definition: data_set.cpp:2151
void print_top_input_target_columns_correlations() const
Definition: data_set.cpp:5983
void set_has_columns_names(const bool &)
Sets if the data file contains a header with the names of the columns.
Definition: data_set.cpp:4866
Tensor< Index, 1 > get_testing_samples_indices() const
Returns the indices of the samples which will be used for testing.
Definition: data_set.cpp:1123
void set_columns_number(const Index &)
Definition: data_set.cpp:3465
Tensor< Index, 1 > get_selection_samples_indices() const
Returns the indices of the samples which will be used for selection.
Definition: data_set.cpp:1098
Tensor< type, 1 > calculate_selection_targets_mean() const
Returns the mean values of the target variables on the selection.
Definition: data_set.cpp:5843
const bool & get_display() const
Definition: data_set.cpp:94
void set_selection()
Sets all the samples in the data set for selection.
Definition: data_set.cpp:1488
Tensor< Histogram, 1 > calculate_columns_distribution(const Index &=10) const
Definition: data_set.cpp:5201
const Tensor< type, 2 > & get_data() const
Definition: data_set.cpp:3673
Index get_variables_number() const
Returns the number of variables in the data set.
Definition: data_set.cpp:2814
void set_data_constant(const type &)
Definition: data_set.cpp:6440
Tensor< type, 2 > get_sample_target_data(const Index &) const
Definition: data_set.cpp:4181
void set_samples_unused()
Sets all the samples in the data set for unused.
Definition: data_set.cpp:1562
Tensor< type, 3 > calculate_cross_correlations(const Index &=10) const
Calculates the cross-correlation between all the variables in the data set.
Definition: data_set.cpp:9424
Tensor< Index, 1 > get_target_columns_indices() const
Returns a indices vector with the positions of the targets.
Definition: data_set.cpp:2315
void split_samples_sequential(const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2))
Definition: data_set.cpp:1834
void unscale_input_variables(const Tensor< Descriptives, 1 > &)
Definition: data_set.cpp:6351
Index lags_number
Number of lags.
Definition: data_set.h:800
Tensor< Descriptives, 1 > calculate_target_variables_descriptives() const
Definition: data_set.cpp:5745
void set_target()
Sets all the variables in the data set as target variables.
Definition: data_set.cpp:3441
void impute_missing_values_mean()
Substitutes all the missing values by the mean of the corresponding variable.
Definition: data_set.cpp:9768
void set_default()
Definition: data_set.cpp:4796
string data_file_name
Data file name.
Definition: data_set.h:772
Tensor< Descriptives, 1 > calculate_variables_descriptives() const
Definition: data_set.cpp:5499
void check_input_csv(const string &, const char &) const
This method checks if the input data file has the correct format. Returns an error message.
Definition: data_set.cpp:8111
char get_separator_char() const
Returns the string which will be used as separator in the data file.
Definition: data_set.cpp:3771
const bool & get_rows_label() const
Returns true if the data file has rows label, and false otherwise.
Definition: data_set.cpp:3720
Tensor< Tensor< Index, 1 >, 1 > calculate_Tukey_outliers(const type &=type(1.5)) const
Definition: data_set.cpp:8540
void load(const string &)
Definition: data_set.cpp:7716
void load_data_binary()
This method loads the data from a binary data file.
Definition: data_set.cpp:8023
void set_samples_uses(const Tensor< SampleUse, 1 > &)
Definition: data_set.cpp:1637
string missing_values_label
Missing values label.
Definition: data_set.h:780
const Index & get_lags_number() const
Returns the number of lags to be used in a time series prediction application.
Definition: data_set.cpp:3825
void set_variables_unused()
Sets all the variables in the data set as unused variables.
Definition: data_set.cpp:3452
void load_time_series_data_binary(const string &)
This method loads time series data from a binary data.
Definition: data_set.cpp:8067
Index calculate_training_negatives(const Index &) const
Definition: data_set.cpp:5398
void set_gmt(Index &)
Definition: data_set.cpp:5865
Tensor< Index, 1 > get_used_samples_indices() const
Returns the indices of the used samples(those which are not set unused).
Definition: data_set.cpp:1148
void set_missing_values_method(const MissingValuesMethod &)
Definition: data_set.cpp:4985
Tensor< type, 1 > get_sample_data(const Index &) const
Definition: data_set.cpp:4093
Tensor< type, 2 > get_sample_input_data(const Index &) const
Definition: data_set.cpp:4163
bool display
Display messages to screen.
Definition: data_set.h:832
void impute_missing_values_median()
Substitutes all the missing values by the median of the corresponding variable.
Definition: data_set.cpp:9802
void set_separator(const Separator &)
Definition: data_set.cpp:4883
Tensor< type, 1 > calculate_input_variables_minimums() const
Returns a vector containing the minimums of the input variables.
Definition: data_set.cpp:5767
string get_sample_string(const Index &, const string &=",") const
Definition: data_set.cpp:1006
Index get_time_series_columns_number() const
Returns the number of columns in the time series.
Definition: data_set.cpp:2807
Index get_unused_samples_number() const
Definition: data_set.cpp:1455
virtual ~DataSet()
Destructor.
Definition: data_set.cpp:84
void print_inputs_correlations() const
Print on screen the correlation between variables in the data set.
Definition: data_set.cpp:6064
Index get_unused_variables_number() const
Returns the number of variables which will neither be used as input nor as target.
Definition: data_set.cpp:2910
Index steps_ahead
Number of steps ahead.
Definition: data_set.h:804
bool is_empty() const
Returns true if the data matrix is empty, and false otherwise.
Definition: data_set.cpp:3658
void set_lags_number(const Index &)
Definition: data_set.cpp:5022
Index get_time_columns_number() const
Returns the number of columns whose uses are Time.
Definition: data_set.cpp:2643
const Tensor< Index, 1 > & get_input_variables_dimensions() const
Returns the dimensions of the input variables.
Definition: data_set.cpp:2245
Tensor< Index, 1 > calculate_target_distribution() const
Definition: data_set.cpp:8480
void set_samples_number(const Index &)
Definition: data_set.cpp:5062
Tensor< Index, 1 > get_input_columns_indices() const
Returns a indices vector with the positions of the inputs.
Definition: data_set.cpp:2271
SampleUse get_sample_use(const Index &) const
Definition: data_set.cpp:1199
Tensor< VariableUse, 1 > get_columns_uses() const
Returns the uses of each columns of the data set.
Definition: data_set.cpp:2008
Tensor< string, 1 > get_used_columns_names() const
Returns a string vector which contains the names of the columns used whether Input,...
Definition: data_set.cpp:2551
void set()
Sets zero samples and zero variables in the data set.
Definition: data_set.cpp:4586
void set_has_rows_label(const bool &)
Sets if the data file contains rows label.
Definition: data_set.cpp:4874
Tensor< Index, 1 > get_target_variables_indices() const
Returns the indices of the target variables.
Definition: data_set.cpp:3094
Tensor< string, 1 > unuse_uncorrelated_columns(const type &=type(0.25))
Definition: data_set.cpp:5163
void set_variables_names(const Tensor< string, 1 > &)
Definition: data_set.cpp:3355
void split_samples_random(const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2))
Definition: data_set.cpp:1726
MissingValuesMethod
Enumeration of available methods for missing values in the data.
Definition: data_set.h:85
Tensor< Index, 1 > get_samples_uses_numbers() const
Definition: data_set.cpp:943
Tensor< Index, 1 > unuse_repeated_samples()
Definition: data_set.cpp:5112
void set_testing()
Sets all the samples in the data set for testing.
Definition: data_set.cpp:1501
void print_top_inputs_correlations() const
Definition: data_set.cpp:6091
const string & get_missing_values_label() const
Returns the string which will be used as label for the missing values in the data file.
Definition: data_set.cpp:3817
string get_variable_name(const Index &) const
Definition: data_set.cpp:2059
void print_data() const
Prints to the screen the values of the data matrix.
Definition: data_set.cpp:7783
const Separator & get_separator() const
Returns the separator to be used in the data file.
Definition: data_set.cpp:3763
Tensor< type, 1 > calculate_target_variables_minimums() const
Returns a vector containing the minimums of the target variables.
Definition: data_set.cpp:5775
void set_missing_values_label(const string &)
Definition: data_set.cpp:4961
Index get_selection_samples_number() const
Returns the number of samples in the data set which will be used for selection.
Definition: data_set.cpp:1402
const Tensor< SampleUse, 1 > & get_samples_uses() const
Returns the use of every sample (training, selection, testing or unused) in a vector.
Definition: data_set.cpp:1207
Index get_variable_index(const string &name) const
Definition: data_set.cpp:2937
Tensor< type, 2 > get_selection_target_data() const
Definition: data_set.cpp:4052
void save_data() const
Saves to the data file the values of the data matrix.
Definition: data_set.cpp:7844
void save(const string &) const
Definition: data_set.cpp:7681
Tensor< type, 2 > get_testing_data() const
Definition: data_set.cpp:3938
Tensor< Descriptives, 1 > calculate_used_variables_descriptives() const
Definition: data_set.cpp:5514
MissingValuesMethod missing_values_method
Missing values method.
Definition: data_set.h:820
Tensor< type, 2 > get_training_data() const
Definition: data_set.cpp:3900
Tensor< bool, 1 > get_input_columns_binary() const
Returns the input columns of the data set.
Definition: data_set.cpp:2739
Tensor< type, 2 > get_testing_input_data() const
Definition: data_set.cpp:4066
void transform_time_series()
Arranges an input-target DataSet from a time series matrix, according to the number of lags.
Definition: data_set.cpp:8007
VariableUse get_variable_use(const Index &) const
Definition: data_set.cpp:1992
Tensor< type, 1 > calculate_variables_means(const Tensor< Index, 1 > &) const
Definition: data_set.cpp:5809
Tensor< Descriptives, 1 > calculate_columns_descriptives_selection_samples() const
Definition: data_set.cpp:5712
string get_separator_string() const
Returns the string which will be used as separator in the data file.
Definition: data_set.cpp:3794
Separator
Enumeration of available separators for the data file.
Definition: data_set.h:81
Index get_input_variables_number() const
Definition: data_set.cpp:2859
Tensor< Index, 1 > get_unused_columns_indices() const
Returns a indices vector with the positions of the unused columns.
Definition: data_set.cpp:2359
Tensor< type, 2 > read_input_csv(const string &, const char &, const string &, const bool &, const bool &) const
This method loads data from a file and returns a matrix containing the input columns.
Definition: data_set.cpp:8182
Index get_target_columns_number() const
Returns the number of columns whose uses are Target.
Definition: data_set.cpp:2609
Tensor< type, 2 > time_series_data
Definition: data_set.h:810
Tensor< Correlation, 2 > calculate_input_columns_correlations() const
Definition: data_set.cpp:6019
void generate_random_data(const Index &, const Index &)
Definition: data_set.cpp:9574
void scrub_missing_values()
Definition: data_set.cpp:9828
Index get_testing_samples_number() const
Returns the number of samples in the data set which will be used for testing.
Definition: data_set.cpp:1422
const string & get_time_column() const
Returns the indices of the time variables in the data set.
Definition: data_set.cpp:3841
Index calculate_selection_negatives(const Index &) const
Definition: data_set.cpp:5433
Index get_used_columns_number() const
Returns the number of columns that are used.
Definition: data_set.cpp:2679
bool has_nan_row(const Index &) const
Returns true if the given row contains missing values.
Definition: data_set.cpp:5922
Index get_time_series_variables_number() const
Returns the number of variables in the time series data.
Definition: data_set.cpp:2835
void print_missing_values_information() const
Definition: data_set.cpp:5940
void set_default_columns_names()
Definition: data_set.cpp:1968
Index calculate_used_negatives(const Index &) const
Definition: data_set.cpp:5363
Index get_used_samples_number() const
Definition: data_set.cpp:1443
void save_time_series_data_binary(const string &) const
Saves to the data file the values of the time series data matrix in binary format.
Definition: data_set.cpp:7958
string time_column
Index where time variable is located for forecasting applications.
Definition: data_set.h:796
Tensor< Index, 1 > get_used_columns_indices() const
Returns a indices vector with the positions of the used columns.
Definition: data_set.cpp:2383
Tensor< string, 1 > get_variables_names() const
Definition: data_set.cpp:2118
Tensor< type, 2 > get_selection_input_data() const
Definition: data_set.cpp:4038
Tensor< type, 2 > get_time_series_column_data(const Index &) const
Definition: data_set.cpp:4309
Index get_unused_columns_number() const
Returns the number of columns that are not used.
Definition: data_set.cpp:2661
const Index & get_steps_ahead() const
Returns the number of steps ahead to be used in a time series prediction application.
Definition: data_set.cpp:3833
bool has_rows_labels
Header which contains the rows label.
Definition: data_set.h:788
Tensor< Correlation, 2 > calculate_input_target_columns_correlations() const
Definition: data_set.cpp:5876
Index get_gmt() const
Definition: data_set.cpp:5856
void set_default_columns_scalers()
Definition: data_set.cpp:6128
void unscale_target_variables(const Tensor< Descriptives, 1 > &)
Definition: data_set.cpp:6397
Tensor< VariableUse, 1 > get_variables_uses() const
Definition: data_set.cpp:2026
static Scaler get_scaling_unscaling_method(const string &)
Definition: data_set.cpp:3861
void set_data_binary_random()
Definition: data_set.cpp:6458
void set_time_column(const string &)
Definition: data_set.cpp:5041
Tensor< Index, 1 > get_variable_indices(const Index &) const
Definition: data_set.cpp:4248
Tensor< type, 2 > data
Definition: data_set.h:754
Tensor< type, 1 > get_variable_data(const Index &) const
Definition: data_set.cpp:4353
Tensor< type, 2 > get_selection_data() const
Definition: data_set.cpp:3921
Tensor< type, 2 > get_column_data(const Index &) const
Definition: data_set.cpp:4289
Tensor< string, 1 > get_input_columns_names() const
Returns a string vector that contains the names of the columns whose uses are Input.
Definition: data_set.cpp:2505
void set_data_file_name(const string &)
Definition: data_set.cpp:4858
void print() const
Prints to the screen in text format the main numbers from the data set object.
Definition: data_set.cpp:7664
const bool & get_header_line() const
Returns true if the first line of the data file has a header with the names of the variables,...
Definition: data_set.cpp:3712
Index missing_values_number
Missing values.
Definition: data_set.h:824
Tensor< type, 2 > get_input_data() const
Definition: data_set.cpp:3955
void set_column_name(const Index &, const string &)
Definition: data_set.cpp:1983
void set_display(const bool &)
Definition: data_set.cpp:4785
Tensor< string, 1 > get_target_variables_names() const
Definition: data_set.cpp:2215
Tensor< Index, 1 > get_unused_samples_indices() const
Returns the indices of the samples set unused.
Definition: data_set.cpp:1173
void set_input_columns_unused()
Sets all input columns in the data_set as unused columns.
Definition: data_set.cpp:3229
void print_data_preview() const
Definition: data_set.cpp:7792
Tensor< type, 2 > get_training_input_data() const
Definition: data_set.cpp:4010
Tensor< Index, 1 > get_input_variables_indices() const
Returns the indices of the input variables.
Definition: data_set.cpp:3047
Tensor< string, 1 > get_input_variables_names() const
Definition: data_set.cpp:2184
void set_columns_unused()
Sets all columns in the data_set as unused columns.
Definition: data_set.cpp:3200
Tensor< Index, 1 > filter_data(const Tensor< type, 1 > &, const Tensor< type, 1 > &)
Definition: data_set.cpp:9662
void write_XML(tinyxml2::XMLPrinter &) const
Serializes the data set object into a XML document of the TinyXML library without keep the DOM tree i...
Definition: data_set.cpp:6486
Tensor< Descriptives, 1 > calculate_columns_descriptives_negative_samples() const
Calculate the descriptives of the samples with negative targets in binary classification problems.
Definition: data_set.cpp:5584
Tensor< type, 1 > get_samples_uses_percentages() const
Definition: data_set.cpp:977
void set_data(const Tensor< type, 2 > &)
Definition: data_set.cpp:4831
Tensor< BoxPlot, 1 > calculate_columns_box_plots() const
Definition: data_set.cpp:5320
void generate_sequential_data(const Index &, const Index &)
Definition: data_set.cpp:9587
Tensor< Index, 2 > get_batches(const Tensor< Index, 1 > &, const Index &, const bool &, const Index &buffer_size=100) const
Definition: data_set.cpp:1217
Separator separator
Separator character.
Definition: data_set.h:776
Index get_column_index(const string &) const
Definition: data_set.cpp:4192
void set_columns_names(const Tensor< string, 1 > &)
Definition: data_set.cpp:3403
Tensor< Column, 1 > get_input_columns() const
Returns the input columns of the data set.
Definition: data_set.cpp:2717
Tensor< string, 1 > unuse_constant_columns()
Definition: data_set.cpp:5073
bool is_sample_unused(const Index &) const
Definition: data_set.cpp:926
void set_column_use(const Index &, const VariableUse &)
Definition: data_set.cpp:3255
MissingValuesMethod get_missing_values_method() const
Returns a string with the method used.
Definition: data_set.cpp:3696
Tensor< Descriptives, 1 > calculate_columns_descriptives_categories(const Index &) const
Definition: data_set.cpp:5645
Index get_columns_number() const
Returns the number of columns in the data set.
Definition: data_set.cpp:2800
Tensor< Index, 1 > get_unused_variables_indices() const
Returns the indices of the unused variables.
Definition: data_set.cpp:2956
bool has_nan() const
Returns true if the data contain missing values.
Definition: data_set.cpp:5912
const string & get_data_file_name() const
Returns the name of the data file.
Definition: data_set.cpp:3704
void set_steps_ahead_number(const Index &)
Definition: data_set.cpp:5032
Tensor< Column, 1 > get_used_columns() const
Returns the used columns of the data set.
Definition: data_set.cpp:2781
void set_default_columns_uses()
Definition: data_set.cpp:1921
const Tensor< type, 2 > & get_time_series_data() const
Definition: data_set.cpp:3688
void print_input_target_columns_correlations() const
Print on screen the correlation between targets and inputs.
Definition: data_set.cpp:5960
Tensor< type, 2 > get_testing_target_data() const
Definition: data_set.cpp:4080
Tensor< Descriptives, 1 > scale_input_variables()
Definition: data_set.cpp:6243
Tensor< Index, 1 > get_used_variables_indices() const
Returns the indices of the used variables.
Definition: data_set.cpp:3002
void unuse_Tukey_outliers(const type &=type(1.5))
Definition: data_set.cpp:8624
Tensor< Column, 1 > get_columns() const
Returns the columns of the data set.
Definition: data_set.cpp:2697
Tensor< type, 1 > calculate_target_variables_maximums() const
Returns a vector containing the maximums of the target variables.
Definition: data_set.cpp:5792
Index get_input_columns_number() const
Returns the number of columns whose uses are Input.
Definition: data_set.cpp:2575
Tensor< type, 2 > get_training_target_data() const
Definition: data_set.cpp:4024
Index get_used_variables_number() const
Returns the number of variables which are either input nor target.
Definition: data_set.cpp:2259
Tensor< Column, 1 > get_target_columns() const
Returns the target columns of the data set.
Definition: data_set.cpp:2759
void set_columns_uses(const Tensor< string, 1 > &)
Definition: data_set.cpp:3142
const XMLElement * NextSiblingElement(const char *name=nullptr) const
Get the next(right) sibling element of this node, with an optionally supplied name.
Definition: tinyxml2.cpp:1059
void PushText(const char *text, bool cdata=false)
Add a text node.
Definition: tinyxml2.cpp:2878
void PushAttribute(const char *name, const char *value)
If streaming, add an attribute to an open element.
Definition: tinyxml2.cpp:2783
virtual void CloseElement(bool compactMode=false)
If streaming, close the Element.
Definition: tinyxml2.cpp:2834
uint32 sqrt(uint32 &r, int &exp)
Definition: half.hpp:1480
uint32 log2(uint32 m, unsigned int n=32)
Definition: half.hpp:1531
HALF_CONSTEXPR half abs(half arg)
Definition: half.hpp:2735
half log(half arg)
Definition: half.hpp:3050
HALF_CONSTEXPR bool isnan(half arg)
Definition: half.hpp:4385
half ceil(half arg)
Definition: half.hpp:4071
half pow(half x, half y)
Definition: half.hpp:3427
Index get_categories_number() const
Returns the number of categories.
Definition: data_set.cpp:735
virtual ~Column()
Destructor.
Definition: data_set.cpp:133
void set_categories_uses(const Tensor< string, 1 > &)
Definition: data_set.cpp:294
Tensor< string, 1 > categories
Categories within the column.
Definition: data_set.h:143
void add_category(const string &)
Definition: data_set.cpp:270
string name
Column name.
Definition: data_set.h:131
VariableUse column_use
Column use.
Definition: data_set.h:135
Tensor< VariableUse, 1 > categories_uses
Categories use.
Definition: data_set.h:147
void set_use(const VariableUse &)
Definition: data_set.cpp:182
void set_type(const string &)
Definition: data_set.cpp:230
Tensor< string, 1 > get_used_variables_names() const
Returns a string vector that contains the names of the used variables in the data set.
Definition: data_set.cpp:758
Index get_used_categories_number() const
Returns the number of used categories.
Definition: data_set.cpp:743
Column()
Default constructor.
Definition: data_set.cpp:102
DataSetBatch()
Default constructor.
Definition: data_set.h:890