11using namespace OpenNN;
49 set(new_samples_number, new_variables_number);
62DataSet::DataSet(
const Index& new_samples_number,
const Index& new_inputs_number,
const Index& new_targets_number)
64 set(new_samples_number, new_inputs_number, new_targets_number);
76DataSet::DataSet(
const string& data_file_name,
const char& separator,
const bool& has_columns_names)
86 delete non_blocking_thread_pool;
87 delete thread_pool_device;
106 type = ColumnType::Numeric;
110 scaler = Scaler::MeanStandardDeviation;
119 const Scaler& new_scaler,
120 const Tensor<string, 1>& new_categories,
121 const Tensor<VariableUse, 1>& new_categories_uses)
125 column_use = new_column_use;
127 categories = new_categories;
128 categories_uses = new_categories_uses;
137void DataSet::Column::set_scaler(
const Scaler& new_scaler)
143void DataSet::Column::set_scaler(
const string& new_scaler)
145 if(new_scaler ==
"NoScaling")
147 set_scaler(Scaler::NoScaling);
149 else if(new_scaler ==
"MinimumMaximum")
151 set_scaler(Scaler::MinimumMaximum);
153 else if(new_scaler ==
"MeanStandardDeviation")
155 set_scaler(Scaler::MeanStandardDeviation);
157 else if(new_scaler ==
"StandardDeviation")
159 set_scaler(Scaler::StandardDeviation);
161 else if(new_scaler ==
"Logarithm")
163 set_scaler(Scaler::Logarithm);
167 ostringstream buffer;
169 buffer <<
"OpenNN Exception: DataSet class.\n"
170 <<
"void set_scaler(const string&) method.\n"
171 <<
"Unknown scaler: " << new_scaler <<
"\n";
173 throw logic_error(buffer.str());
184 column_use = new_column_use;
186 for(Index i = 0; i < categories_uses.size(); i++)
188 categories_uses(i) = new_column_use;
198 if(new_column_use ==
"Input")
200 set_use(VariableUse::Input);
202 else if(new_column_use ==
"Target")
204 set_use(VariableUse::Target);
206 else if(new_column_use ==
"Time")
208 set_use(VariableUse::Time);
210 else if(new_column_use ==
"Unused")
212 set_use(VariableUse::UnusedVariable);
216 ostringstream buffer;
218 buffer <<
"OpenNN Exception: DataSet class.\n"
219 <<
"void set_use(const string&) method.\n"
220 <<
"Unknown column use: " << new_column_use <<
"\n";
222 throw logic_error(buffer.str());
232 if(new_column_type ==
"Numeric")
234 type = ColumnType::Numeric;
236 else if(new_column_type ==
"Binary")
238 type = ColumnType::Binary;
240 else if(new_column_type ==
"Categorical")
242 type = ColumnType::Categorical;
244 else if(new_column_type ==
"DateTime")
246 type = ColumnType::DateTime;
248 else if(new_column_type ==
"Constant")
250 type = ColumnType::Constant;
254 ostringstream buffer;
256 buffer <<
"OpenNN Exception: DataSet class.\n"
257 <<
"void Column::set_type(const string&) method.\n"
258 <<
"Column type not valid (" << new_column_type <<
").\n";
260 throw logic_error(buffer.str());
272 const Index old_categories_number = categories.size();
274 Tensor<string, 1> old_categories = categories;
275 Tensor<VariableUse, 1> old_categories_uses = categories_uses;
277 categories.resize(old_categories_number+1);
278 categories_uses.resize(old_categories_number+1);
280 for(Index category_index = 0; category_index < old_categories_number; category_index++)
282 categories(category_index) = old_categories(category_index);
283 categories_uses(category_index) = column_use;
286 categories(old_categories_number) = new_category;
287 categories_uses(old_categories_number) = column_use;
296 const Index new_categories_uses_number = new_categories_uses.size();
298 categories_uses.resize(new_categories_uses_number);
300 for(Index i = 0; i < new_categories_uses.size(); i++)
302 if(new_categories_uses(i) ==
"Input")
304 categories_uses(i) = VariableUse::Input;
306 else if(new_categories_uses(i) ==
"Target")
308 categories_uses(i) = VariableUse::Target;
310 else if(new_categories_uses(i) ==
"Time")
312 categories_uses(i) = VariableUse::Time;
314 else if(new_categories_uses(i) ==
"Unused"
315 || new_categories_uses(i) ==
"UnusedVariable")
317 categories_uses(i) = VariableUse::UnusedVariable;
321 ostringstream buffer;
323 buffer <<
"OpenNN Exception: DataSet class.\n"
324 <<
"void Column::set_categories_uses(const Tensor<string, 1>&) method.\n"
325 <<
"Category use not valid (" << new_categories_uses(i) <<
").\n";
327 throw logic_error(buffer.str());
338 categories_uses.setConstant(new_categories_use);
344 ostringstream buffer;
352 buffer <<
"OpenNN Exception: DataSet class.\n"
353 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
354 <<
"Name element is nullptr.\n";
356 throw logic_error(buffer.str());
359 if(name_element->GetText())
361 const string new_name = name_element->GetText();
372 buffer <<
"OpenNN Exception: DataSet class.\n"
373 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
374 <<
"Scaler element is nullptr.\n";
376 throw logic_error(buffer.str());
379 if(scaler_element->GetText())
381 const string new_scaler = scaler_element->GetText();
383 set_scaler(new_scaler);
388 const tinyxml2::XMLElement* column_use_element = column_document.FirstChildElement(
"ColumnUse");
390 if(!column_use_element)
392 buffer <<
"OpenNN Exception: DataSet class.\n"
393 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
394 <<
"Column use element is nullptr.\n";
396 throw logic_error(buffer.str());
399 if(column_use_element->GetText())
401 const string new_column_use = column_use_element->GetText();
403 set_use(new_column_use);
412 buffer <<
"OpenNN Exception: DataSet class.\n"
413 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
414 <<
"Type element is nullptr.\n";
416 throw logic_error(buffer.str());
419 if(type_element->GetText())
421 const string new_type = type_element->GetText();
425 if(type == ColumnType::Categorical)
429 const tinyxml2::XMLElement* categories_element = column_document.FirstChildElement(
"Categories");
431 if(!categories_element)
433 buffer <<
"OpenNN Exception: DataSet class.\n"
434 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
435 <<
"Categories element is nullptr.\n";
437 throw logic_error(buffer.str());
440 if(categories_element->GetText())
442 const string new_categories = categories_element->GetText();
444 categories = get_tokens(new_categories,
';');
449 const tinyxml2::XMLElement* categories_uses_element = column_document.FirstChildElement(
"CategoriesUses");
451 if(!categories_uses_element)
453 buffer <<
"OpenNN Exception: DataSet class.\n"
454 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
455 <<
"Categories uses element is nullptr.\n";
457 throw logic_error(buffer.str());
460 if(categories_uses_element->GetText())
462 const string new_categories_uses = categories_uses_element->GetText();
464 set_categories_uses(get_tokens(new_categories_uses,
';'));
474 file_stream.OpenElement(
"Name");
482 file_stream.OpenElement(
"Scaler");
484 if(scaler == Scaler::NoScaling)
488 else if(scaler == Scaler::MinimumMaximum)
490 file_stream.
PushText(
"MinimumMaximum");
492 else if(scaler == Scaler::MeanStandardDeviation)
494 file_stream.
PushText(
"MeanStandardDeviation");
496 else if(scaler == Scaler::StandardDeviation)
498 file_stream.
PushText(
"StandardDeviation");
500 else if(scaler == Scaler::Logarithm)
509 file_stream.OpenElement(
"ColumnUse");
511 if(column_use == VariableUse::Input)
515 else if(column_use == VariableUse::Target)
519 else if(column_use == VariableUse::UnusedVariable)
532 file_stream.OpenElement(
"Type");
534 if(type == ColumnType::Numeric)
538 else if(type == ColumnType::Binary)
542 else if(type == ColumnType::Categorical)
544 file_stream.
PushText(
"Categorical");
546 else if(type == ColumnType::Constant)
557 if(type == ColumnType::Categorical || type == ColumnType::Binary)
559 if(categories.size() == 0)
return;
563 file_stream.OpenElement(
"Categories");
565 for(Index i = 0; i < categories.size(); i++)
567 file_stream.
PushText(categories(i).c_str());
569 if(i != categories.size()-1)
579 file_stream.OpenElement(
"CategoriesUses");
581 for(Index i = 0; i < categories_uses.size(); i++)
583 if(categories_uses(i) == VariableUse::Input)
587 else if(categories_uses(i) == VariableUse::Target)
591 else if(categories_uses(i) == VariableUse::Time)
600 if(i != categories_uses.size()-1)
611void DataSet::Column::print()
const
613 cout <<
"Name: " << name << endl;
615 cout <<
"Column use: ";
619 case VariableUse::Input:
621 cout <<
"Input" << endl;
625 case VariableUse::Target:
627 cout <<
"Target" << endl;
632 case VariableUse::UnusedVariable:
634 cout <<
"Unused" << endl;
639 case VariableUse::Time:
641 cout <<
"Time" << endl;
646 case VariableUse::Id:
648 cout <<
"Id" << endl;
653 cout <<
"Column type: ";
657 case ColumnType::Numeric:
659 cout <<
"Numeric" << endl;
663 case ColumnType::Binary:
665 cout <<
"Binary" << endl;
667 cout <<
"Categories: " << categories << endl;
671 case ColumnType::Categorical:
673 cout <<
"Categorical" << endl;
675 cout <<
"Categories: " << categories << endl;
679 case ColumnType::DateTime:
681 cout <<
"DateTime" << endl;
686 case ColumnType::Constant:
688 cout <<
"Constant" << endl;
697 case Scaler::NoScaling:
698 cout <<
"NoScaling" << endl;
701 case Scaler::MinimumMaximum:
702 cout <<
"MinimumMaximum" << endl;
705 case Scaler::MeanStandardDeviation:
706 cout <<
"MeanStandardDeviation" << endl;
709 case Scaler::StandardDeviation:
710 cout <<
"StandardDeviation" << endl;
713 case Scaler::Logarithm:
714 cout <<
"Logarithm" << endl;
720Index DataSet::Column::get_variables_number()
const
722 if(type == ColumnType::Categorical)
724 return categories.size();
737 return categories.size();
745 Index used_categories_number = 0;
747 for(Index i = 0; i < categories.size(); i++)
749 if(categories_uses(i) != VariableUse::UnusedVariable) used_categories_number++;
752 return used_categories_number;
760 Tensor<string, 1> used_variables_names;
762 if(type != ColumnType::Categorical && column_use != VariableUse::UnusedVariable)
764 used_variables_names.resize(1);
765 used_variables_names.setConstant(name);
767 else if(type == ColumnType::Categorical)
769 used_variables_names.resize(get_used_categories_number());
771 Index category_index = 0;
773 for(Index i = 0; i < categories.size(); i++)
775 if(categories_uses(i) != VariableUse::UnusedVariable)
777 used_variables_names(category_index) = categories(i);
784 return used_variables_names;
794 time_series_columns = columns;
798 Tensor<Column, 1> new_columns;
800 if(has_time_columns())
812 Index ahead_index = 0;
813 Index column_index = 0;
814 Index new_column_index = 0;
818 column_index = i%columns_number;
820 if(time_series_columns(column_index).type == ColumnType::DateTime)
continue;
824 new_columns(new_column_index).name = columns(column_index).name +
"_lag_" + to_string(lag_index);
825 new_columns(new_column_index).set_use(VariableUse::Input);
827 new_columns(new_column_index).type = columns(column_index).type;
828 new_columns(new_column_index).categories = columns(column_index).categories;
829 new_columns(new_column_index).categories_uses = columns(column_index).categories_uses;
835 new_columns(new_column_index).name = columns(column_index).name +
"_ahead_" + to_string(ahead_index);
837 new_columns(new_column_index).type = columns(column_index).type;
838 new_columns(new_column_index).categories = columns(column_index).categories;
840 if(new_columns(new_column_index).type == ColumnType::Constant)
842 new_columns(new_column_index).set_use(VariableUse::UnusedVariable);
843 new_columns(new_column_index).categories_uses.resize(columns(column_index).get_categories_number());
844 new_columns(new_column_index).categories_uses.setConstant(VariableUse::UnusedVariable);
848 new_columns(new_column_index).set_use(VariableUse::Target);
849 new_columns(new_column_index).categories_uses.resize(columns(column_index).get_categories_number());
850 new_columns(new_column_index).categories_uses.setConstant(VariableUse::Target);
856 if(lag_index > 0 && column_index == columns_number - 1)
860 else if(column_index == columns_number - 1)
866 columns = new_columns;
870void DataSet::transform_time_series_data()
874 const Index old_samples_number =
data.dimension(0);
875 const Index old_variables_number =
data.dimension(1);
882 data.resize(new_samples_number, new_variables_number);
886 for(Index j = 0; j < old_variables_number; j++)
896 memcpy(
data.data() + i*(old_variables_number-index)*new_samples_number + (j-index)*new_samples_number,
902 samples_uses.resize(new_samples_number);
912 if(samples_uses(index) == SampleUse::UnusedSample)
928 if(samples_uses(index) == SampleUse::UnusedSample)
945 Tensor<Index, 1> count(4);
947 const Index samples_number = get_samples_number();
949 for(Index i = 0; i < samples_number; i++)
951 if(samples_uses(i) == SampleUse::Training)
955 else if(samples_uses(i) == SampleUse::Selection)
959 else if(samples_uses(i) == SampleUse::Testing)
979 const Index samples_number = get_samples_number();
985 const type training_samples_percentage = type(training_samples_number*100)/
static_cast<type
>(samples_number);
986 const type selection_samples_percentage = type(selection_samples_number*100)/
static_cast<type
>(samples_number);
987 const type testing_samples_percentage = type(testing_samples_number*100)/
static_cast<type
>(samples_number);
988 const type unused_samples_percentage = type(unused_samples_number*100)/
static_cast<type
>(samples_number);
990 Tensor<type, 1> samples_uses_percentage(4);
992 samples_uses_percentage.setValues({training_samples_percentage,
993 selection_samples_percentage,
994 testing_samples_percentage,
995 unused_samples_percentage});
997 return samples_uses_percentage;
1008 const Tensor<type, 1> sample =
data.chip(sample_index, 0);
1010 string sample_string =
"";
1014 Index variable_index = 0;
1016 for(Index i = 0; i < columns_number; i++)
1018 if(columns(i).type == ColumnType::Numeric)
1021 else sample_string += to_string(
double(
data(sample_index, variable_index)));
1025 else if(columns(i).type == ColumnType::Binary)
1028 else sample_string += columns(i).categories(
static_cast<Index
>(
data(sample_index, variable_index)));
1032 else if(columns(i).type == ColumnType::DateTime)
1037 else sample_string += to_string(
double(
data(sample_index, variable_index)));
1041 else if(columns(i).type == ColumnType::Categorical)
1043 if(
isnan(
data(sample_index, variable_index)))
1049 const Index categories_number = columns(i).get_categories_number();
1051 for(Index j = 0; j < categories_number; j++)
1053 if(
abs(
data(sample_index, variable_index+j) -
static_cast<type
>(1)) < type(NUMERIC_LIMITS_MIN))
1055 sample_string += columns(i).categories(j);
1060 variable_index += categories_number;
1064 if(i != columns_number-1) sample_string +=
separator +
" ";
1067 return sample_string;
1075 const Index samples_number = get_samples_number();
1079 Tensor<Index, 1> training_indices(training_samples_number);
1083 for(Index i = 0; i < samples_number; i++)
1085 if(samples_uses(i) == SampleUse::Training)
1087 training_indices(count) = i;
1092 return training_indices;
1100 const Index samples_number = get_samples_number();
1104 Tensor<Index, 1> selection_indices(selection_samples_number);
1108 for(Index i = 0; i < samples_number; i++)
1110 if(samples_uses(i) == SampleUse::Selection)
1112 selection_indices(count) = i;
1117 return selection_indices;
1125 const Index samples_number = get_samples_number();
1129 Tensor<Index, 1> testing_indices(testing_samples_number);
1133 for(Index i = 0; i < samples_number; i++)
1135 if(samples_uses(i) == SampleUse::Testing)
1137 testing_indices(count) = i;
1142 return testing_indices;
1150 const Index samples_number = get_samples_number();
1154 Tensor<Index, 1> used_indices(used_samples_number);
1158 for(Index i = 0; i < samples_number; i++)
1160 if(samples_uses(i) != SampleUse::UnusedSample)
1162 used_indices(index) = i;
1167 return used_indices;
1175 const Index samples_number = get_samples_number();
1179 Tensor<Index, 1> unused_indices(unused_samples_number);
1183 for(Index i = 0; i < samples_number; i++)
1185 if(samples_uses(i) == SampleUse::UnusedSample)
1187 unused_indices(count) = i;
1192 return unused_indices;
1201 return samples_uses(index);
1209 return samples_uses;
1218 const Index& batch_samples_number,
1219 const bool& shuffle,
1220 const Index& new_buffer_size)
const
1222 if(!shuffle)
return split_samples(samples_indices, batch_samples_number);
1224 std::random_device rng;
1225 std::mt19937 urng(rng());
1227 const Index samples_number = samples_indices.size();
1229 Index buffer_size = new_buffer_size;
1230 Index batches_number;
1231 Index batch_size = batch_samples_number;
1235 if(buffer_size > samples_number)
1237 buffer_size = samples_number;
1242 if(samples_number < batch_size)
1245 batch_size = samples_number;
1246 buffer_size = batch_size;
1248 Tensor<Index,1> samples_copy(samples_indices);
1250 Tensor<Index, 2> batches(batches_number, batch_size);
1254 std::shuffle(samples_copy.data(), samples_copy.data() + samples_copy.size(), urng);
1256 for(Index i = 0; i > batch_size; i++)
1258 batches(0,i) = samples_copy(i);
1266 batches_number = samples_number / batch_size;
1269 Tensor<Index, 2> batches(batches_number, batch_size);
1271 Tensor<Index, 1> buffer(buffer_size);
1273 for(Index i = 0; i < buffer_size; i++) buffer(i) = i;
1275 Index next_index = buffer_size;
1276 Index random_index = 0;
1280 if(batch_size < buffer_size)
1282 Index diff = buffer_size/ batch_size;
1286 for(Index i = 0; i < batches_number; i++)
1290 if(i == batches_number-diff)
1292 Index buffer_index = 0;
1294 for(Index k = batches_number-diff; k < batches_number; k++)
1296 for(Index j = 0; j < batch_size; j++)
1298 batches(k,j) = buffer(buffer_index);
1309 for(Index j = 0; j < batch_size; j++)
1311 random_index =
static_cast<Index
>(rand()%buffer_size);
1313 batches(i, j) = buffer(random_index);
1315 buffer(random_index) = samples_indices(next_index);
1327 for(Index i = 0; i < batches_number; i++)
1331 if(i == batches_number-1)
1333 std::shuffle(buffer.data(), buffer.data() + buffer.size(), urng);
1335 if(batch_size <= buffer_size)
1337 for(Index j = 0; j < batch_size;j++)
1339 batches(i,j) = buffer(j);
1344 for(Index j = 0; j < buffer_size; j++)
1346 batches(i,j) = buffer(j);
1349 for(Index j = buffer_size; j < batch_size; j++)
1351 batches(i,j) = samples_indices(next_index);
1362 for(Index j = 0; j < batch_size; j++)
1364 random_index =
static_cast<Index
>(rand()%buffer_size);
1366 batches(i, j) = buffer(random_index);
1368 buffer(random_index) = samples_indices(next_index);
1384 const Index samples_number = get_samples_number();
1386 Index training_samples_number = 0;
1388 for(Index i = 0; i < samples_number; i++)
1390 if(samples_uses(i) == SampleUse::Training)
1392 training_samples_number++;
1396 return training_samples_number;
1404 const Index samples_number = get_samples_number();
1406 Index selection_samples_number = 0;
1408 for(Index i = 0; i < samples_number; i++)
1410 if(samples_uses(i) == SampleUse::Selection)
1412 selection_samples_number++;
1416 return selection_samples_number;
1424 const Index samples_number = get_samples_number();
1426 Index testing_samples_number = 0;
1428 for(Index i = 0; i < samples_number; i++)
1430 if(samples_uses(i) == SampleUse::Testing)
1432 testing_samples_number++;
1436 return testing_samples_number;
1445 const Index samples_number = get_samples_number();
1448 return (samples_number - unused_samples_number);
1457 const Index samples_number = get_samples_number();
1459 Index unused_samples_number = 0;
1461 for(Index i = 0; i < samples_number; i++)
1463 if(samples_uses(i) == SampleUse::UnusedSample)
1465 unused_samples_number++;
1469 return unused_samples_number;
1477 const Index samples_number = get_samples_number();
1479 for(Index i = 0; i < samples_number; i++)
1481 samples_uses(i) = SampleUse::Training;
1490 const Index samples_number = get_samples_number();
1492 for(Index i = 0; i < samples_number; i++)
1494 samples_uses(i) = SampleUse::Selection;
1503 const Index samples_number = get_samples_number();
1505 for(Index i = 0; i < samples_number; i++)
1507 samples_uses(i) = SampleUse::Testing;
1519 for(Index i = 0; i < indices.size(); i++)
1523 samples_uses(index) = SampleUse::Training;
1535 for(Index i = 0; i < indices.size(); i++)
1539 samples_uses(index) = SampleUse::Selection;
1551 for(Index i = 0; i < indices.size(); i++)
1555 samples_uses(index) = SampleUse::Testing;
1564 const Index samples_number = get_samples_number();
1566 for(Index i = 0; i < samples_number; i++)
1568 samples_uses(i) = SampleUse::UnusedSample;
1578 for(Index i = 0; i < static_cast<Index>(indices.size()); i++)
1580 const Index index = indices(i);
1582 samples_uses(index) = SampleUse::UnusedSample;
1593 samples_uses(index) = new_use;
1604 if(new_use ==
"Training")
1606 samples_uses(index) = SampleUse::Training;
1608 else if(new_use ==
"Selection")
1610 samples_uses(index) = SampleUse::Selection;
1612 else if(new_use ==
"Testing")
1614 samples_uses(index) = SampleUse::Testing;
1616 else if(new_use ==
"Unused")
1618 samples_uses(index) = SampleUse::UnusedSample;
1622 ostringstream buffer;
1624 buffer <<
"OpenNN Exception: DataSet class.\n"
1625 <<
"void set_sample_use(const string&) method.\n"
1626 <<
"Unknown sample use: " << new_use <<
"\n";
1628 throw logic_error(buffer.str());
1639 const Index samples_number = get_samples_number();
1643 const Index new_uses_size = new_uses.size();
1645 if(new_uses_size != samples_number)
1647 ostringstream buffer;
1649 buffer <<
"OpenNN Exception: DataSet class.\n"
1650 <<
"void set_samples_uses(const Tensor<SampleUse, 1>&) method.\n"
1651 <<
"Size of uses(" << new_uses_size <<
") must be equal to number of samples(" << samples_number <<
").\n";
1653 throw logic_error(buffer.str());
1658 for(Index i = 0; i < samples_number; i++)
1660 samples_uses(i) = new_uses(i);
1672 const Index samples_number = get_samples_number();
1674 ostringstream buffer;
1678 const Index new_uses_size = new_uses.size();
1680 if(new_uses_size != samples_number)
1682 buffer <<
"OpenNN Exception: DataSet class.\n"
1683 <<
"void set_samples_uses(const Tensor<string, 1>&) method.\n"
1684 <<
"Size of uses(" << new_uses_size <<
") must be equal to number of samples(" << samples_number <<
").\n";
1686 throw logic_error(buffer.str());
1691 for(Index i = 0; i < samples_number; i++)
1693 if(new_uses(i).compare(
"Training") == 0 || new_uses(i).compare(
"0") == 0)
1695 samples_uses(i) = SampleUse::Training;
1697 else if(new_uses(i).compare(
"Selection") == 0 || new_uses(i).compare(
"1") == 0)
1699 samples_uses(i) = SampleUse::Selection;
1701 else if(new_uses(i).compare(
"Testing") == 0 || new_uses(i).compare(
"2") == 0)
1703 samples_uses(i) = SampleUse::Testing;
1705 else if(new_uses(i).compare(
"Unused") == 0 || new_uses(i).compare(
"3") == 0)
1707 samples_uses(i) = SampleUse::UnusedSample;
1711 buffer <<
"OpenNN Exception: DataSet class.\n"
1712 <<
"void set_samples_uses(const Tensor<string, 1>&) method.\n"
1713 <<
"Unknown sample use: " << new_uses(i) <<
".\n";
1715 throw logic_error(buffer.str());
1727 const type& selection_samples_ratio,
1728 const type& testing_samples_ratio)
1730 std::random_device rng;
1731 std::mt19937 urng(rng());
1735 if(used_samples_number == 0)
return;
1737 const type total_ratio = training_samples_ratio + selection_samples_ratio + testing_samples_ratio;
1741 const Index selection_samples_number =
static_cast<Index
>(selection_samples_ratio*type(used_samples_number)/type(total_ratio));
1742 const Index testing_samples_number =
static_cast<Index
>(testing_samples_ratio* type(used_samples_number)/ type(total_ratio));
1743 const Index training_samples_number = used_samples_number - selection_samples_number - testing_samples_number;
1745 const Index sum_samples_number = training_samples_number + selection_samples_number + testing_samples_number;
1747 if(sum_samples_number != used_samples_number)
1749 ostringstream buffer;
1751 buffer <<
"OpenNN Warning: DataSet class.\n"
1752 <<
"void split_samples_random(const type&, const type&, const type&) method.\n"
1753 <<
"Sum of numbers of training, selection and testing samples is not equal to number of used samples.\n";
1755 throw logic_error(buffer.str());
1758 const Index samples_number = get_samples_number();
1760 Tensor<Index, 1> indices;
1762 initialize_sequential(indices, 0, 1, samples_number-1);
1764 std::shuffle(indices.data(), indices.data() + indices.size(), urng);
1768 for(Index i = 0; i < samples_uses.size(); i++)
1770 if(samples_uses(i) == SampleUse::UnusedSample) count ++;
1778 Index count_training = 0;
1780 while(count_training != training_samples_number)
1784 if(samples_uses(index) != SampleUse::UnusedSample)
1786 samples_uses(index)= SampleUse::Training;
1795 Index count_selection = 0;
1797 while(count_selection != selection_samples_number)
1801 if(samples_uses(index) != SampleUse::UnusedSample)
1803 samples_uses(index) = SampleUse::Selection;
1812 Index count_testing = 0;
1814 while(count_testing != testing_samples_number)
1818 if(samples_uses(index) != SampleUse::UnusedSample)
1820 samples_uses(index) = SampleUse::Testing;
1835 const type& selection_samples_ratio,
1836 const type& testing_samples_ratio)
1840 if(used_samples_number == 0)
return;
1842 const type total_ratio = training_samples_ratio + selection_samples_ratio + testing_samples_ratio;
1846 const Index selection_samples_number =
static_cast<Index
>(selection_samples_ratio* type(used_samples_number)/ type(total_ratio));
1847 const Index testing_samples_number =
static_cast<Index
>(testing_samples_ratio* type(used_samples_number)/ type(total_ratio));
1848 const Index training_samples_number = used_samples_number - selection_samples_number - testing_samples_number;
1850 const Index sum_samples_number = training_samples_number + selection_samples_number + testing_samples_number;
1852 if(sum_samples_number != used_samples_number)
1854 ostringstream buffer;
1856 buffer <<
"OpenNN Warning: Samples class.\n"
1857 <<
"void split_samples_sequential(const type&, const type&, const type&) method.\n"
1858 <<
"Sum of numbers of training, selection and testing samples is not equal to number of used samples.\n";
1860 throw logic_error(buffer.str());
1867 Index count_training = 0;
1869 while(count_training != training_samples_number)
1871 if(samples_uses(i) != SampleUse::UnusedSample)
1873 samples_uses(i) = SampleUse::Training;
1882 Index count_selection = 0;
1884 while(count_selection != selection_samples_number)
1886 if(samples_uses(i) != SampleUse::UnusedSample)
1888 samples_uses(i) = SampleUse::Selection;
1897 Index count_testing = 0;
1899 while(count_testing != testing_samples_number)
1901 if(samples_uses(i) != SampleUse::UnusedSample)
1903 samples_uses(i) = SampleUse::Testing;
1912void DataSet::set_columns(
const Tensor<Column, 1>& new_columns)
1914 columns = new_columns;
1923 const Index columns_number = columns.size();
1925 bool target =
false;
1927 if(columns_number == 0)
1932 else if(columns_number == 1)
1934 columns(0).set_use(VariableUse::UnusedVariable);
1941 for(Index i = columns.size()-1; i >= 0; i--)
1943 if(columns(i).type == ColumnType::Constant || columns(i).type == ColumnType::DateTime)
1945 columns(i).set_use(VariableUse::UnusedVariable);
1951 columns(i).set_use(VariableUse::Target);
1959 input_variables_dimensions.resize(1);
1970 const Index columns_number = columns.size();
1972 for(Index i = 0; i < columns_number; i++)
1974 columns(i).name =
"column_" + to_string(1+i);
1985 columns(column_index).name = new_name;
2002 return columns(index).column_use;
2012 Tensor<DataSet::VariableUse, 1> columns_uses(columns_number);
2014 for(Index i = 0; i < columns_number; i++)
2016 columns_uses(i) = columns(i).column_use;
2019 return columns_uses;
2031 Tensor<VariableUse, 1> variables_uses(variables_number);
2035 for(Index i = 0; i < columns_number; i++)
2037 if(columns(i).type == ColumnType::Categorical)
2039 for(Index i = 0; i < (columns(i).categories_uses).size(); i++)
2041 variables_uses(i + index) = (columns(i).categories_uses)(i);
2043 index += columns(i).categories.size();
2047 variables_uses(index) = columns(i).column_use;
2052 return variables_uses;
2065 if(variable_index >= variables_number)
2067 ostringstream buffer;
2069 buffer <<
"OpenNN Exception: DataSet class.\n"
2070 <<
"string& get_variable_name(const Index) method.\n"
2071 <<
"Index of variable("<<variable_index<<
") must be less than number of variables("<<variables_number<<
").\n";
2073 throw logic_error(buffer.str());
2082 for(Index i = 0; i < columns_number; i++)
2084 if(columns(i).type == ColumnType::Categorical)
2086 for(Index j = 0; j < columns(i).get_categories_number(); j++)
2088 if(index == variable_index)
2090 return columns(i).categories(j);
2100 if(index == variable_index)
2102 return columns(i).name;
2122 Tensor<string, 1> variables_names(variables_number);
2126 for(Index i = 0; i < columns.size(); i++)
2128 if(columns(i).type == ColumnType::Categorical)
2130 for(Index j = 0; j < columns(i).categories.size(); j++)
2132 variables_names(index) = columns(i).categories(j);
2139 variables_names(index) = columns(i).name;
2145 return variables_names;
2155 Tensor<string, 1> variables_names(variables_number);
2159 for(Index i = 0; i < time_series_columns.size(); i++)
2161 if(time_series_columns(i).type == ColumnType::Categorical)
2163 for(Index j = 0; j < time_series_columns(i).categories.size(); j++)
2165 variables_names(index) = time_series_columns(i).categories(j);
2172 variables_names(index) = time_series_columns(i).name;
2177 return variables_names;
2190 Tensor<string, 1> input_variables_names(input_variables_number);
2194 for(Index i = 0; i < input_columns_indices.size(); i++)
2196 Index input_index = input_columns_indices(i);
2198 const Tensor<string, 1> current_used_variables_names = columns(input_index).get_used_variables_names();
2200 for(Index j = 0; j < current_used_variables_names.size(); j++)
2202 input_variables_names(index + j) = current_used_variables_names(j);
2205 index += current_used_variables_names.size();
2208 return input_variables_names;
2221 Tensor<string, 1> target_variables_names(target_variables_number);
2225 for(Index i = 0; i < target_columns_indices.size(); i++)
2227 const Index target_index = target_columns_indices(i);
2229 const Tensor<string, 1> current_used_variables_names = columns(target_index).get_used_variables_names();
2231 for(Index j = 0; j < current_used_variables_names.size(); j++)
2233 target_variables_names(index + j) = current_used_variables_names(j);
2236 index += current_used_variables_names.size();
2239 return target_variables_names;
2247 return input_variables_dimensions;
2251Index DataSet::get_input_variables_rank()
const
2253 return input_variables_dimensions.rank();
2265 return (variables_number - unused_variables_number);
2275 Tensor<Index, 1> input_columns_indices(input_columns_number);
2279 for(Index i = 0; i < columns.size(); i++)
2281 if(columns(i).column_use == VariableUse::Input)
2283 input_columns_indices(index) = i;
2288 return input_columns_indices;
2292Tensor<Index, 1> DataSet::get_input_time_series_columns_indices()
const
2294 const Index input_columns_number = get_input_time_series_columns_number();
2296 Tensor<Index, 1> input_columns_indices(input_columns_number);
2300 for(Index i = 0; i < time_series_columns.size(); i++)
2302 if(time_series_columns(i).column_use == VariableUse::Input)
2304 input_columns_indices(index) = i;
2309 return input_columns_indices;
2319 Tensor<Index, 1> target_columns_indices(target_columns_number);
2323 for(Index i = 0; i < columns.size(); i++)
2325 if(columns(i).column_use == VariableUse::Target)
2327 target_columns_indices(index) = i;
2332 return target_columns_indices;
2336Tensor<Index, 1> DataSet::get_target_time_series_columns_indices()
const
2338 const Index target_columns_number = get_target_time_series_columns_number();
2340 Tensor<Index, 1> target_columns_indices(target_columns_number);
2344 for(Index i = 0; i < time_series_columns.size(); i++)
2346 if(time_series_columns(i).column_use == VariableUse::Target)
2348 target_columns_indices(index) = i;
2353 return target_columns_indices;
2363 Tensor<Index, 1> unused_columns_indices(unused_columns_number);
2367 for(Index i = 0; i < unused_columns_number; i++)
2370 if(columns(i).column_use == VariableUse::UnusedVariable)
2372 unused_columns_indices(index) = i;
2377 return unused_columns_indices;
2389 Tensor<Index, 1> used_indices(used_variables_number);
2393 for(Index i = 0; i < variables_number; i++)
2395 if(columns(i).column_use == VariableUse::Input
2396 || columns(i).column_use == VariableUse::Target
2397 || columns(i).column_use == VariableUse::Time)
2399 used_indices(index) = i;
2404 return used_indices;
2408Tensor<Scaler, 1> DataSet::get_columns_scalers()
const
2412 Tensor<Scaler, 1> columns_scalers(columns_number);
2414 for(Index i = 0; i < columns_number; i++)
2416 columns_scalers(i) = columns(i).scaler;
2419 return columns_scalers;
2423Tensor<Scaler, 1> DataSet::get_input_variables_scalers()
const
2430 Tensor<Scaler, 1> input_variables_scalers(input_variables_number);
2434 for(Index i = 0; i < input_columns_number; i++)
2436 for(Index j = 0; j < input_columns(i).get_variables_number(); j++)
2438 input_variables_scalers(index) = input_columns(i).scaler;
2443 return input_variables_scalers;
2447Tensor<Scaler, 1> DataSet::get_target_variables_scalers()
const
2454 Tensor<Scaler, 1> target_variables_scalers(target_variables_number);
2458 for(Index i = 0; i < target_columns_number; i++)
2460 for(Index j = 0; j < target_columns(i).get_variables_number(); j++)
2462 target_variables_scalers(index) = target_columns(i).scaler;
2467 return target_variables_scalers;
2477 Tensor<string, 1> columns_names(columns_number);
2479 for(Index i = 0; i < columns_number; i++)
2481 columns_names(i) = columns(i).name;
2484 return columns_names;
2488Tensor<string, 1> DataSet::get_time_series_columns_names()
const
2492 Tensor<string, 1> columns_names(columns_number);
2494 for(Index i = 0; i < columns_number; i++)
2496 columns_names(i) = time_series_columns(i).name;
2499 return columns_names;
2509 Tensor<string, 1> input_columns_names(input_columns_number);
2513 for(Index i = 0; i < columns.size(); i++)
2515 if(columns(i).column_use == VariableUse::Input)
2517 input_columns_names(index) = columns(i).name;
2522 return input_columns_names;
2532 Tensor<string, 1> target_columns_names(target_columns_number);
2536 for(Index i = 0; i < columns.size(); i++)
2538 if(columns(i).column_use == VariableUse::Target)
2540 target_columns_names(index) = columns(i).name;
2545 return target_columns_names;
2556 Tensor<string, 1> names(used_columns_number);
2560 for(Index i = 0; i < columns_number; i++)
2562 if(columns(i).column_use != VariableUse::UnusedVariable)
2564 names(index) = columns(i).name;
2577 Index input_columns_number = 0;
2579 for(Index i = 0; i < columns.size(); i++)
2581 if(columns(i).column_use == VariableUse::Input)
2583 input_columns_number++;
2587 return input_columns_number;
2591Index DataSet::get_input_time_series_columns_number()
const
2593 Index input_columns_number = 0;
2595 for(Index i = 0; i < time_series_columns.size(); i++)
2597 if(time_series_columns(i).column_use == VariableUse::Input)
2599 input_columns_number++;
2603 return input_columns_number;
2611 Index target_columns_number = 0;
2613 for(Index i = 0; i < columns.size(); i++)
2615 if(columns(i).column_use == VariableUse::Target)
2617 target_columns_number++;
2621 return target_columns_number;
2625Index DataSet::get_target_time_series_columns_number()
const
2627 Index target_columns_number = 0;
2629 for(Index i = 0; i < time_series_columns.size(); i++)
2631 if(time_series_columns(i).column_use == VariableUse::Target)
2633 target_columns_number++;
2637 return target_columns_number;
2645 Index time_columns_number = 0;
2647 for(Index i = 0; i < columns.size(); i++)
2649 if(columns(i).column_use == VariableUse::Time)
2651 time_columns_number++;
2655 return time_columns_number;
2663 Index unused_columns_number = 0;
2665 for(Index i = 0; i < columns.size(); i++)
2667 if(columns(i).column_use == VariableUse::UnusedVariable)
2669 unused_columns_number++;
2673 return unused_columns_number;
2681 Index used_columns_number = 0;
2683 for(Index i = 0; i < columns.size(); i++)
2685 if(columns(i).column_use != VariableUse::UnusedVariable)
2687 used_columns_number++;
2691 return used_columns_number;
2703Tensor<DataSet::Column, 1> DataSet::get_time_series_columns()
const
2705 return time_series_columns;
2709Index DataSet::get_time_series_data_rows_number()
const
2721 Tensor<Column, 1> input_columns(inputs_number);
2722 Index input_index = 0;
2724 for(Index i = 0; i < columns.size(); i++)
2726 if(columns(i).column_use == VariableUse::Input)
2728 input_columns(input_index) = columns(i);
2733 return input_columns;
2743 Tensor<bool, 1> input_columns_binary(columns_number);
2745 for(Index i = 0; i < columns_number; i++)
2747 if(columns(i).column_use == VariableUse::Input)
2748 input_columns_binary(i) =
true;
2750 input_columns_binary(i) =
false;
2753 return input_columns_binary;
2763 Tensor<Column, 1> target_columns(targets_number);
2764 Index target_index = 0;
2766 for(Index i = 0; i < columns.size(); i++)
2768 if(columns(i).column_use == VariableUse::Target)
2770 target_columns(target_index) = columns(i);
2775 return target_columns;
2787 Tensor<DataSet::Column, 1> used_columns(used_columns_number);
2789 for(Index i = 0; i < used_columns_number; i++)
2791 used_columns(i) = columns(used_columns_indices(i));
2794 return used_columns;
2802 return columns.size();
2809 return time_series_columns.size();
2816 Index variables_number = 0;
2818 for (Index i = 0; i < columns.size(); i++)
2820 if (columns(i).type == ColumnType::Categorical)
2822 variables_number += columns(i).categories.size();
2830 return variables_number;
2837 Index variables_number = 0;
2839 for(Index i = 0; i < time_series_columns.size(); i++)
2841 if(columns(i).type == ColumnType::Categorical)
2843 variables_number += time_series_columns(i).categories.size();
2851 return variables_number;
2861 Index inputs_number = 0;
2863 for(Index i = 0; i < columns.size(); i++)
2865 if(columns(i).type == ColumnType::Categorical)
2867 for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2869 if(columns(i).categories_uses(j) == VariableUse::Input) inputs_number++;
2872 else if(columns(i).column_use == VariableUse::Input)
2878 return inputs_number;
2886 Index targets_number = 0;
2888 for(Index i = 0; i < columns.size(); i++)
2890 if(columns(i).type == ColumnType::Categorical)
2892 for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2894 if(columns(i).categories_uses(j) == VariableUse::Target) targets_number++;
2898 else if(columns(i).column_use == VariableUse::Target)
2904 return targets_number;
2912 Index unused_number = 0;
2914 for(Index i = 0; i < columns.size(); i++)
2916 if(columns(i).type == ColumnType::Categorical)
2918 for(Index j = 0; j < columns(i).categories_uses.size(); j++)
2920 if(columns(i).categories_uses(j) == VariableUse::UnusedVariable) unused_number++;
2924 else if(columns(i).column_use == VariableUse::UnusedVariable)
2930 return unused_number;
2943 for(Index i = 0; i < variables_number; i++)
2945 if(variables_names(i) == name)
return i;
2962 Tensor<Index, 1> unused_indices(unused_number);
2964 Index unused_index = 0;
2965 Index unused_variable_index = 0;
2967 for(Index i = 0; i < columns.size(); i++)
2969 if(columns(i).type == ColumnType::Categorical)
2971 const Index current_categories_number = columns(i).get_categories_number();
2973 for(Index j = 0; j < current_categories_number; j++)
2975 if(columns(i).categories_uses(j) == VariableUse::UnusedVariable)
2977 unused_indices(unused_index) = unused_variable_index;
2981 unused_variable_index++;
2984 else if(columns(i).column_use == VariableUse::UnusedVariable)
2986 unused_indices(unused_index) = i;
2988 unused_variable_index++;
2992 unused_variable_index++;
2996 return unused_indices;
3006 Tensor<Index, 1> used_indices(used_number);
3008 Index used_index = 0;
3009 Index used_variable_index = 0;
3011 for(Index i = 0; i < columns.size(); i++)
3013 if(columns(i).type == ColumnType::Categorical)
3015 const Index current_categories_number = columns(i).get_categories_number();
3017 for(Index j = 0; j < current_categories_number; j++)
3019 if(columns(i).categories_uses(j) != VariableUse::UnusedVariable)
3021 used_indices(used_index) = used_variable_index;
3025 used_variable_index++;
3028 else if(columns(i).column_use != VariableUse::UnusedVariable)
3030 used_indices(used_index) = used_variable_index;
3032 used_variable_index++;
3036 used_variable_index++;
3040 return used_indices;
3053 Tensor<Index, 1> input_variables_indices(inputs_number);
3055 Index input_index = 0;
3056 Index input_variable_index = 0;
3058 for(Index i = 0; i < columns.size(); i++)
3061 if(columns(i).type == ColumnType::Categorical)
3063 const Index current_categories_number = columns(i).get_categories_number();
3065 for(Index j = 0; j < current_categories_number; j++)
3067 if(columns(i).categories_uses(j) == VariableUse::Input)
3069 input_variables_indices(input_index) = input_variable_index;
3073 input_variable_index++;
3076 else if(columns(i).column_use == VariableUse::Input)
3078 input_variables_indices(input_index) = input_variable_index;
3080 input_variable_index++;
3084 input_variable_index++;
3088 return input_variables_indices;
3100 Tensor<Index, 1> target_variables_indices(targets_number);
3102 Index target_index = 0;
3103 Index target_variable_index = 0;
3105 for(Index i = 0; i < columns.size(); i++)
3107 if(columns(i).type == ColumnType::Categorical)
3109 const Index current_categories_number = columns(i).get_categories_number();
3111 for(Index j = 0; j < current_categories_number; j++)
3113 if(columns(i).categories_uses(j) == VariableUse::Target)
3115 target_variables_indices(target_index) = target_variable_index;
3119 target_variable_index++;
3122 else if(columns(i).column_use == VariableUse::Target)
3124 target_variables_indices(target_index) = target_variable_index;
3126 target_variable_index++;
3130 target_variable_index++;
3134 return target_variables_indices;
3144 const Index new_columns_uses_size = new_columns_uses.size();
3146 if(new_columns_uses_size != columns.size())
3148 ostringstream buffer;
3150 buffer <<
"OpenNN Exception: DataSet class.\n"
3151 <<
"void set_columns_uses(const Tensor<string, 1>&) method.\n"
3152 <<
"Size of columns uses ("
3153 << new_columns_uses_size <<
") must be equal to columns size ("
3154 << columns.size() <<
"). \n";
3156 throw logic_error(buffer.str());
3159 for(Index i = 0; i < new_columns_uses.size(); i++)
3161 columns(i).set_use(new_columns_uses(i));
3164 input_variables_dimensions.resize(1);
3175 const Index new_columns_uses_size = new_columns_uses.size();
3177 if(new_columns_uses_size != columns.size())
3179 ostringstream buffer;
3181 buffer <<
"OpenNN Exception: DataSet class.\n"
3182 <<
"void set_columns_uses(const Tensor<string, 1>&) method.\n"
3183 <<
"Size of columns uses (" << new_columns_uses_size <<
") must be equal to columns size (" << columns.size() <<
"). \n";
3185 throw logic_error(buffer.str());
3188 for(Index i = 0; i < new_columns_uses.size(); i++)
3190 columns(i).set_use(new_columns_uses(i));
3193 input_variables_dimensions.resize(1);
3204 for(Index i = 0; i < columns_number; i++)
3211void DataSet::set_input_target_columns(
const Tensor<Index, 1>& input_columns,
const Tensor<Index, 1>& target_columns)
3215 for(Index i = 0; i < input_columns.size(); i++)
3220 for(Index i = 0; i < target_columns.size(); i++)
3233 for(Index i = 0; i < columns_number; i++)
3235 if(columns(i).column_use == DataSet::VariableUse::Input)
set_column_use(i, VariableUse::UnusedVariable);
3241void DataSet::set_input_columns(
const Tensor<Index, 1>& input_columns_indices,
const Tensor<bool, 1>& input_columns_use)
3243 for(Index i = 0; i < input_columns_indices.size(); i++)
3245 if(input_columns_use(i))
set_column_use(input_columns_indices(i), VariableUse::Input);
3246 else set_column_use(input_columns_indices(i), VariableUse::UnusedVariable);
3257 columns(index).column_use = new_use;
3259 if(columns(index).type == ColumnType::Categorical)
3261 columns(index).set_categories_uses(new_use);
3277void DataSet::set_column_type(
const Index& index,
const ColumnType& new_type)
3279 columns[index].type = new_type;
3283void DataSet::set_column_type(
const string& name,
const ColumnType& new_type)
3287 set_column_type(index, new_type);
3301 if(variable_index >= variables_number)
3303 ostringstream buffer;
3305 buffer <<
"OpenNN Exception: Variables class.\n"
3306 <<
"void set_name(const Index&, const string&) method.\n"
3307 <<
"Index of variable must be less than number of variables.\n";
3309 throw logic_error(buffer.str());
3318 for(Index i = 0; i < columns_number; i++)
3320 if(columns(i).type == ColumnType::Categorical)
3322 for(Index j = 0; j < columns(i).get_categories_number(); j++)
3324 if(index == variable_index)
3326 columns(i).categories(j) = new_variable_name;
3337 if(index == variable_index)
3339 columns(i).name = new_variable_name;
3361 const Index size = new_variables_names.size();
3363 if(size != variables_number)
3365 ostringstream buffer;
3367 buffer <<
"OpenNN Exception: Variables class.\n"
3368 <<
"void set_names(const Tensor<string, 1>&) method.\n"
3369 <<
"Size (" << size <<
") must be equal to number of variables (" << variables_number <<
").\n";
3371 throw logic_error(buffer.str());
3380 for(Index i = 0; i < columns_number; i++)
3382 if(columns(i).type == ColumnType::Categorical)
3384 for(Index j = 0; j < columns(i).get_categories_number(); j++)
3386 columns(i).categories(j) = new_variables_names(index);
3392 columns(i).name = new_variables_names(index);
3405 const Index new_names_size = new_names.size();
3408 if(new_names_size != columns_number)
3410 ostringstream buffer;
3412 buffer <<
"OpenNN Exception: DataSet class.\n"
3413 <<
"void set_columns_names(const Tensor<string, 1>&).\n"
3414 <<
"Size of names (" << new_names.size() <<
") is not equal to columns number (" << columns_number <<
").\n";
3416 throw logic_error(buffer.str());
3419 for(Index i = 0; i < columns_number; i++)
3421 columns(i).name = new_names(i);
3430 for(Index i = 0; i < columns.size(); i++)
3432 if(columns(i).type == ColumnType::Constant)
continue;
3434 columns(i).set_use(VariableUse::Input);
3443 for(Index i = 0; i < columns.size(); i++)
3445 columns(i).set_use(VariableUse::Target);
3454 for(Index i = 0; i < columns.size(); i++)
3456 columns(i).set_use(VariableUse::UnusedVariable);
3467 columns.resize(new_columns_number);
3473void DataSet::set_columns_scalers(
const Scaler& scalers)
3477 for(Index i = 0; i < columns_number; i++)
3479 columns(i).scaler = scalers;
3483void DataSet::set_binary_simple_columns()
3485 bool is_binary =
true;
3487 Index variable_index = 0;
3489 Index different_values = 0;
3491 for(Index column_index = 0; column_index < columns.size(); column_index++)
3493 if(columns(column_index).type == ColumnType::Numeric)
3495 Tensor<type, 1> values(3);
3497 different_values = 0;
3500 for(Index row_index = 0; row_index <
data.dimension(0); row_index++)
3502 if(!
isnan(
data(row_index, variable_index))
3503 &&
data(row_index, variable_index) != values(0)
3504 &&
data(row_index, variable_index) != values(1))
3506 values(different_values) =
data(row_index, variable_index);
3511 if(row_index == (
data.dimension(0)-1)){
3512 if(different_values == 1){
3518 if(different_values > 2)
3527 columns(column_index).type = ColumnType::Binary;
3528 scale_minimum_maximum_binary(
data, values(0), values(1), column_index);
3529 columns(column_index).categories.resize(2);
3531 if(values(0) == type(0) && values(1) == type(1))
3533 columns(column_index).categories(0) =
"Negative (0)";
3534 columns(column_index).categories(1) =
"Positive (1)";
3536 else if(values(0) == type(1) && values(1) == type(0))
3538 columns(column_index).categories(0) =
"Positive (1)";
3539 columns(column_index).categories(1) =
"Negative (0)";
3543 columns(column_index).categories(0) =
"Class_1";
3544 columns(column_index).categories(1) =
"Class_2";
3547 const VariableUse column_use = columns(column_index).column_use;
3548 columns(column_index).categories_uses.resize(2);
3549 columns(column_index).categories_uses(0) = column_use;
3550 columns(column_index).categories_uses(1) = column_use;
3555 else if(columns(column_index).type == ColumnType::Categorical)
3557 variable_index += columns(column_index).get_categories_number();
3566void DataSet::check_constant_columns()
3568 if(
display) cout <<
"Checking constant columns..." << endl;
3570 Index variable_index = 0;
3574 if(columns(column).type == ColumnType::Numeric)
3578 const Tensor<type, 1> numeric_column =
data.chip(variable_index, 1);
3580 if(standard_deviation(numeric_column) <
static_cast<type
>(1.0e-3))
3582 columns(column).type = ColumnType::Constant;
3583 columns(column).column_use = VariableUse::UnusedVariable;
3588 else if(columns(column).type == ColumnType::DateTime)
3590 columns(column).column_use = VariableUse::UnusedVariable;
3593 else if(columns(column).type == ColumnType::Constant)
3597 else if(columns(column).type == ColumnType::Binary)
3599 if(columns(column).get_categories_number() == 1)
3601 columns(column).type = ColumnType::Constant;
3602 columns(column).column_use = VariableUse::UnusedVariable;
3607 else if(columns(column).type == ColumnType::Categorical)
3609 if(columns(column).get_categories_number() == 1)
3611 columns(column).type = ColumnType::Constant;
3612 columns(column).column_use = VariableUse::UnusedVariable;
3615 variable_index += columns(column).get_categories_number();
3620Tensor<type, 2> DataSet::transform_binary_column(
const Tensor<type, 1>& column)
const
3623 const Index rows_number = column.dimension(0);
3625 Tensor<type, 2> new_column(rows_number , 2);
3626 new_column.setZero();
3628 for(Index i = 0; i < rows_number; i++)
3630 if(
abs(column(i) -
static_cast<type
>(1)) < type(NUMERIC_LIMITS_MIN))
3632 new_column(i,1) =
static_cast<type
>(1);
3634 else if(
abs(column(i)) < type(NUMERIC_LIMITS_MIN))
3636 new_column(i,0) =
static_cast<type
>(1);
3640 new_column(i,0) = type(NAN);
3641 new_column(i,1) = type(NAN);
3652 input_variables_dimensions = new_inputs_dimensions;
3660 if(
data.dimension(0) == 0 ||
data.dimension(1) == 0)
3679Tensor<type, 2>* DataSet::get_data_pointer()
3726Tensor<string, 1> DataSet::get_rows_label_tensor()
const
3731Tensor<string, 1> DataSet::get_testing_rows_label_tensor()
3735 Tensor<string, 1> testing_rows_label(testing_samples_number);
3737 for(Index i = 0; i < testing_samples_number; i++)
3739 testing_rows_label(i) = rows_labels(testing_indices(i));
3742 return testing_rows_label;
3746Tensor<string, 1> DataSet::get_selection_rows_label_tensor()
3750 Tensor<string, 1> selection_rows_label(selection_samples_number);
3752 for(Index i = 0; i < selection_samples_number; i++)
3754 selection_rows_label(i) = rows_labels(selection_indices(i));
3757 return selection_rows_label;
3775 case Separator::Space:
3778 case Separator::Tab:
3781 case Separator::Comma:
3784 case Separator::Semicolon:
3798 case Separator::Space:
3801 case Separator::Tab:
3804 case Separator::Comma:
3807 case Separator::Semicolon:
3847Index DataSet::get_time_series_time_column_index()
const
3849 for(Index i = 0; i < time_series_columns.size(); i++)
3851 if(time_series_columns(i).type == ColumnType::DateTime)
return i;
3854 return static_cast<Index
>(NAN);
3863 if(scaling_unscaling_method ==
"NoScaling")
3865 return Scaler::NoScaling;
3867 else if(scaling_unscaling_method ==
"MinimumMaximum")
3869 return Scaler::MinimumMaximum;
3871 else if(scaling_unscaling_method ==
"Logarithmic")
3873 return Scaler::Logarithm;
3875 else if(scaling_unscaling_method ==
"MeanStandardDeviation")
3877 return Scaler::MeanStandardDeviation;
3879 else if(scaling_unscaling_method ==
"StandardDeviation")
3881 return Scaler::StandardDeviation;
3885 ostringstream buffer;
3887 buffer <<
"OpenNN Exception: DataSet class.\n"
3888 <<
"static Scaler get_scaling_unscaling_method(const string).\n"
3889 <<
"Unknown scaling-unscaling method: " << scaling_unscaling_method <<
".\n";
3891 throw logic_error(buffer.str());
3911 return get_subtensor_data(training_indices, variables_indices);
3927 Tensor<Index, 1> variables_indices;
3928 initialize_sequential(variables_indices, 0, 1, variables_number-1);
3930 return get_subtensor_data(selection_indices, variables_indices);
3942 Tensor<Index, 1> variables_indices;
3943 initialize_sequential(variables_indices, 0, 1, variables_number-1);
3947 return get_subtensor_data(testing_indices, variables_indices);
3957 const Index samples_number = get_samples_number();
3959 Tensor<Index, 1> indices;
3960 initialize_sequential(indices, 0, 1, samples_number-1);
3964 return get_subtensor_data(indices, input_variables_indices);
3978 return get_subtensor_data(indices, target_variables_indices);
3990 return get_subtensor_data(samples_indices, input_variables_indices);
4002 return get_subtensor_data(samples_indices, target_variables_indices);
4016 return get_subtensor_data(training_indices, input_variables_indices);
4030 return get_subtensor_data(training_indices, target_variables_indices);
4044 return get_subtensor_data(selection_indices, input_variables_indices);
4058 return get_subtensor_data(selection_indices, target_variables_indices);
4072 return get_subtensor_data(testing_indices, input_variables_indices);
4086 return get_subtensor_data(testing_indices, target_variables_indices);
4098 const Index samples_number = get_samples_number();
4100 if(index >= samples_number)
4102 ostringstream buffer;
4104 buffer <<
"OpenNN Exception: DataSet class.\n"
4105 <<
"Tensor<type, 1> get_sample(const Index&) const method.\n"
4106 <<
"Index of sample (" << index <<
") must be less than number of samples (" << samples_number <<
").\n";
4108 throw logic_error(buffer.str());
4115 return data.chip(index,0);
4127 const Index samples_number = get_samples_number();
4129 if(sample_index >= samples_number)
4131 ostringstream buffer;
4133 buffer <<
"OpenNN Exception: DataSet class.\n"
4134 <<
"Tensor<type, 1> get_sample(const Index&, const Tensor<Index, 1>&) const method.\n"
4135 <<
"Index of sample must be less than number of \n";
4137 throw logic_error(buffer.str());
4142 const Index variables_number = variables_indices.size();
4144 Tensor<type, 1 > row(variables_number);
4146 for(Index i = 0; i < variables_number; i++)
4148 Index variable_index = variables_indices(i);
4150 row(i) =
data(sample_index, variable_index);
4169 Tensor<type, 2> inputs(1, input_variables_number);
4171 for(Index i = 0; i < input_variables_number; i++)
4172 inputs(0, i) =
data(sample_index, input_variables_indices(i));
4185 return get_subtensor_data(Tensor<Index, 1>(sample_index), target_variables_indices);
4196 for(Index i = 0; i < columns_number; i++)
4198 if(columns(i).name == column_name)
return i;
4201 ostringstream buffer;
4203 buffer <<
"OpenNN Exception: DataSet class.\n"
4204 <<
"Index get_column_index(const string&&) const method.\n"
4205 <<
"Cannot find " << column_name <<
"\n";
4207 throw logic_error(buffer.str());
4218 Index total_variables_number = 0;
4220 for(Index i = 0; i < columns_number; i++)
4222 if(columns(i).type == ColumnType::Categorical)
4224 total_variables_number += columns(i).get_categories_number();
4228 total_variables_number++;
4231 if((variable_index+1) <= total_variables_number)
return i;
4234 ostringstream buffer;
4236 buffer <<
"OpenNN Exception: DataSet class.\n"
4237 <<
"Index get_column_index(const type&) const method.\n"
4238 <<
"Cannot find variable index: " << variable_index <<
".\n";
4240 throw logic_error(buffer.str());
4252 for(Index i = 0; i < column_index; i++)
4254 if(columns(i).type == ColumnType::Categorical)
4256 index += columns(i).categories.size();
4264 if(columns(column_index).type == ColumnType::Categorical)
4266 Tensor<Index, 1> variable_indices(columns(column_index).categories.size());
4268 for(Index j = 0; j<columns(column_index).categories.size(); j++)
4270 variable_indices(j) = index+j;
4273 return variable_indices;
4277 Tensor<Index, 1> indices(1);
4278 indices.setConstant(index);
4291 Index columns_number = 1;
4292 const Index rows_number =
data.dimension(0);
4294 if(columns(column_index).type == ColumnType::Categorical)
4296 columns_number = columns(column_index).get_categories_number();
4299 const Eigen::array<Index, 2> extents = {rows_number, columns_number};
4302 return data.slice(offsets, extents);
4311 Index columns_number = 1;
4315 if(time_series_columns(column_index).type == ColumnType::Categorical)
4317 columns_number = time_series_columns(column_index).get_categories_number();
4320 const Eigen::array<Index, 2> extents = {rows_number, columns_number};
4360 if(index >= variables_number)
4362 ostringstream buffer;
4364 buffer <<
"OpenNN Exception: DataSet class.\n"
4365 <<
"Tensor<type, 1> get_variable(const Index&) const method.\n"
4366 <<
"Index of variable must be less than number of \n";
4368 throw logic_error(buffer.str());
4373 return data.chip(index, 1);
4387 for(Index i = 0; i < variable_names.size(); i++)
4389 if(variable_names(i) == variable_name) size++;
4392 Tensor<Index, 1> variable_index(size);
4396 for(Index i = 0; i < variable_names.size(); i++)
4398 if(variable_names(i) == variable_name)
4400 variable_index(index) = i;
4408 const Index variables_size = variable_index.size();
4410 if(variables_size == 0)
4412 ostringstream buffer;
4414 buffer <<
"OpenNN Exception: DataSet class.\n"
4415 <<
"Tensor<type, 1> get_variable(const string&) const method.\n"
4416 <<
"Variable: " << variable_name <<
" does not exist.\n";
4418 throw logic_error(buffer.str());
4421 if(variables_size > 1)
4423 ostringstream buffer;
4425 buffer <<
"OpenNN Exception: DataSet class.\n"
4426 <<
"Tensor<type, 1> get_variable(const string&) const method.\n"
4427 <<
"Variable: " << variable_name <<
" appears more than once in the data set.\n";
4429 throw logic_error(buffer.str());
4434 return data.chip(variable_index(0), 1);
4449 if(variable_index >= variables_number)
4451 ostringstream buffer;
4453 buffer <<
"OpenNN Exception: DataSet class.\n"
4454 <<
"Tensor<type, 1> get_variable(const Index&, const Tensor<Index, 1>&) const method.\n"
4455 <<
"Index of variable must be less than number of \n";
4457 throw logic_error(buffer.str());
4462 const Index samples_indices_size = samples_indices.size();
4464 Tensor<type, 1 > column(samples_indices_size);
4466 for(Index i = 0; i < samples_indices_size; i++)
4468 Index sample_index = samples_indices(i);
4470 column(i) =
data(sample_index, variable_index);
4488 for(Index i = 0; i < variable_names.size(); i++)
4490 if(variable_names(i) == variable_name) size++;
4493 Tensor<Index, 1> variable_index(size);
4497 for(Index i = 0; i < variable_names.size(); i++)
4499 if(variable_names(i) == variable_name)
4501 variable_index(index) = i;
4509 const Index variables_size = variable_index.size();
4511 if(variables_size == 0)
4513 ostringstream buffer;
4515 buffer <<
"OpenNN Exception: DataSet class.\n"
4516 <<
"Tensor<type, 1> get_variable(const string&) const method.\n"
4517 <<
"Variable: " << variable_name <<
" does not exist.\n";
4519 throw logic_error(buffer.str());
4522 if(variables_size > 1)
4524 ostringstream buffer;
4526 buffer <<
"OpenNN Exception: DataSet class.\n"
4527 <<
"Tensor<type, 1> get_variable(const string&, const Tensor<Index, 1>&) const method.\n"
4528 <<
"Variable: " << variable_name <<
" appears more than once in the data set.\n";
4530 throw logic_error(buffer.str());
4535 const Index samples_indices_size = samples_indices.size();
4537 Tensor<type, 1 > column(samples_indices_size);
4539 for(Index i = 0; i < samples_indices_size; i++)
4541 Index sample_index = samples_indices(i);
4543 column(i) =
data(sample_index, variable_index(0));
4550Tensor<Tensor<string, 1>, 1> DataSet::get_data_file_preview()
const
4552 return data_file_preview;
4556Tensor<type, 2> DataSet::get_subtensor_data(
const Tensor<Index, 1> & rows_indices,
const Tensor<Index, 1> & variables_indices)
const
4558 const Index rows_number = rows_indices.size();
4559 const Index variables_number = variables_indices.size();
4561 Tensor<type, 2> subtensor(rows_number, variables_number);
4564 Index variable_index;
4568 for(Index i = 0; i < rows_number; i++)
4570 row_index = rows_indices(i);
4572 for(Index j = 0; j < variables_number; j++)
4574 variable_index = variables_indices(j);
4576 subtensor(i, j) =
data(row_index, variable_index);
4593 samples_uses.resize(0);
4599 time_series_columns.resize(0);
4602 columns_missing_values_number.resize(0);
4632 const Index variables_number = new_data.dimension(1);
4633 const Index samples_number = new_data.dimension(0);
4635 set(samples_number, variables_number);
4649void DataSet::set(
const Index& new_samples_number,
const Index& new_variables_number)
4653 if(new_samples_number == 0)
4655 ostringstream buffer;
4657 buffer <<
"OpenNN Exception: DataSet class.\n"
4658 <<
"void set(const Index&, const Index&) method.\n"
4659 <<
"Number of samples must be greater than zero.\n";
4661 throw logic_error(buffer.str());
4664 if(new_variables_number == 0)
4666 ostringstream buffer;
4668 buffer <<
"OpenNN Exception: DataSet class.\n"
4669 <<
"void set(const Index&, const Index&) method.\n"
4670 <<
"Number of variables must be greater than zero.\n";
4672 throw logic_error(buffer.str());
4677 data.resize(new_samples_number, new_variables_number);
4679 columns.resize(new_variables_number);
4681 for(Index index = 0; index < new_variables_number-1; index++)
4683 columns(index).name =
"column_" + to_string(index+1);
4684 columns(index).column_use = VariableUse::Input;
4685 columns(index).type = ColumnType::Numeric;
4688 columns(new_variables_number-1).name =
"column_" + to_string(new_variables_number);
4689 columns(new_variables_number-1).column_use = VariableUse::Target;
4690 columns(new_variables_number-1).type = ColumnType::Numeric;
4692 samples_uses.resize(new_samples_number);
4704 const Index& new_inputs_number,
4705 const Index& new_targets_number)
4710 const Index new_variables_number = new_inputs_number + new_targets_number;
4712 data.resize(new_samples_number, new_variables_number);
4714 columns.resize(new_variables_number);
4716 for(Index i = 0; i < new_variables_number; i++)
4718 if(i < new_inputs_number)
4720 columns(i).name =
"column_" + to_string(i+1);
4721 columns(i).column_use = VariableUse::Input;
4722 columns(i).type = ColumnType::Numeric;
4726 columns(i).name =
"column_" + to_string(i+1);
4727 columns(i).column_use = VariableUse::Target;
4728 columns(i).type = ColumnType::Numeric;
4732 input_variables_dimensions.resize(1);
4734 samples_uses.resize(new_samples_number);
4755 columns = other_data_set.columns;
4768 from_XML(data_set_document);
4798 delete non_blocking_thread_pool;
4799 delete thread_pool_device;
4801 const int n = omp_get_max_threads();
4802 non_blocking_thread_pool =
new NonBlockingThreadPool(n);
4803 thread_pool_device =
new ThreadPoolDevice(non_blocking_thread_pool, n);
4819 input_variables_dimensions.resize(1);
4833 const Index samples_number = new_data.dimension(0);
4834 const Index variables_number = new_data.dimension(1);
4836 set(samples_number, variables_number);
4841void DataSet::set_time_series_data(
const Tensor<type, 2>& new_data)
// Resizes the time-series columns container to hold the given number of
// columns. NOTE(review): Eigen tensor resize does not preserve contents —
// existing time-series column metadata is presumably discarded; confirm
// callers repopulate it afterwards.
4847void DataSet::set_time_series_columns_number(
const Index& new_variables_number)
4849 time_series_columns.resize(new_variables_number);
4894 if(new_separator ==
' ')
4898 else if(new_separator ==
'\t')
4902 else if(new_separator ==
',')
4906 else if(new_separator ==
';')
4912 ostringstream buffer;
4914 buffer <<
"OpenNN Exception: DataSet class.\n"
4915 <<
"void set_separator(const char&) method.\n"
4916 <<
"Unknown separator: " << new_separator <<
".\n";
4918 throw logic_error(buffer.str());
4928 if(new_separator_string ==
"Space")
4932 else if(new_separator_string ==
"Tab")
4936 else if(new_separator_string ==
"Comma")
4940 else if(new_separator_string ==
"Semicolon")
4946 ostringstream buffer;
4948 buffer <<
"OpenNN Exception: DataSet class.\n"
4949 <<
"void set_separator(const string&) method.\n"
4950 <<
"Unknown separator: " << new_separator_string <<
".\n";
4952 throw logic_error(buffer.str());
4965 if(get_trimmed(new_missing_values_label).empty())
4967 ostringstream buffer;
4969 buffer <<
"OpenNN Exception: DataSet class.\n"
4970 <<
"void set_missing_values_label(const string&) method.\n"
4971 <<
"Missing values label cannot be empty.\n";
4973 throw logic_error(buffer.str());
4993 if(new_missing_values_method ==
"Unuse")
4997 else if(new_missing_values_method ==
"Mean")
5001 else if(new_missing_values_method ==
"Median")
5007 ostringstream buffer;
5009 buffer <<
"OpenNN Exception: DataSet class.\n"
5010 <<
"void set_missing_values_method(const string & method.\n"
5011 <<
"Not known method type.\n";
5013 throw logic_error(buffer.str());
5047void DataSet::set_threads_number(
const int& new_threads_number)
5049 if(non_blocking_thread_pool !=
nullptr)
delete non_blocking_thread_pool;
5050 if(thread_pool_device !=
nullptr)
delete thread_pool_device;
5052 non_blocking_thread_pool =
new NonBlockingThreadPool(new_threads_number);
5053 thread_pool_device =
new ThreadPoolDevice(non_blocking_thread_pool, new_threads_number);
5066 set(new_samples_number,variables_number);
5079 if(columns_number == 0)
5081 ostringstream buffer;
5083 buffer <<
"OpenNN Exception: DataSet class.\n"
5084 <<
"Tensor<string, 1> unuse_constant_columns() method.\n"
5085 <<
"Number of columns is zero.\n";
5087 throw logic_error(buffer.str());
5094 Tensor<string, 1> constant_columns(0);
5096 for(Index i = 0; i < columns_number; i++)
5098 if(columns(i).type == ColumnType::Constant)
5100 columns(i).set_use(VariableUse::UnusedVariable);
5101 constant_columns = push_back(constant_columns, columns(i).name);
5105 return constant_columns;
5114 const Index samples_number = get_samples_number();
5118 if(samples_number == 0)
5120 ostringstream buffer;
5122 buffer <<
"OpenNN Exception: DataSet class.\n"
5123 <<
"Tensor<Index, 1> unuse_repeated_samples() method.\n"
5124 <<
"Number of samples is zero.\n";
5126 throw logic_error(buffer.str());
5131 Tensor<Index, 1> repeated_samples;
5133 Tensor<type, 1> sample_i;
5134 Tensor<type, 1> sample_j;
5136 #pragma omp parallel for private(sample_i, sample_j) schedule(dynamic)
5138 for(Index i = 0; i < static_cast<Index>(samples_number); i++)
5142 for(Index j =
static_cast<Index
>(i+1); j < samples_number; j++)
5147 && equal(sample_i.data(), sample_i.data()+sample_i.size(), sample_j.data()))
5151 repeated_samples = push_back(repeated_samples, j);
5156 return repeated_samples;
5165 Tensor<string, 1> unused_columns;
5174 for(Index i = 0; i < input_columns_number; i++)
5176 const Index input_column_index = input_columns_indices(i);
5178 for(Index j = 0; j < target_columns_number; j++)
5180 if(!
isnan(correlations(i,j).r)
5181 &&
abs(correlations(i,j).r) < minimum_correlation
5182 && columns(input_column_index).column_use != VariableUse::UnusedVariable)
5184 columns(input_column_index).set_use(VariableUse::UnusedVariable);
5186 unused_columns = push_back(unused_columns, columns(input_column_index).name);
5191 return unused_columns;
5203 const Index columns_number = columns.size();
5206 const Index used_samples_number = used_samples_indices.size();
5208 Tensor<Histogram, 1> histograms(used_columns_number);
5210 Index variable_index = 0;
5211 Index used_column_index = 0;
5213 for(Index i = 0; i < columns_number; i++)
5215 if(columns(i).type == ColumnType::Numeric)
5217 if(columns(i).column_use == VariableUse::UnusedVariable)
5223 Tensor<type, 1> column(used_samples_number);
5225 for(Index j = 0; j < used_samples_number; j++)
5227 column(j) =
data(used_samples_indices(j), variable_index);
5230 histograms(used_column_index) = histogram(column, bins_number);
5233 used_column_index++;
5236 else if(columns(i).type == ColumnType::Categorical)
5238 const Index categories_number = columns(i).get_categories_number();
5240 if(columns(i).column_use == VariableUse::UnusedVariable)
5242 variable_index += categories_number;
5246 Tensor<Index, 1> categories_frequencies(categories_number);
5247 categories_frequencies.setZero();
5248 Tensor<type, 1> centers(categories_number);
5250 for(Index j = 0; j < categories_number; j++)
5252 for(Index k = 0; k < used_samples_number; k++)
5254 if(
abs(
data(used_samples_indices(k), variable_index) - type(1)) < type(NUMERIC_LIMITS_MIN))
5256 categories_frequencies(j)++;
5260 centers(j) =
static_cast<type
>(j);
5265 histograms(used_column_index).frequencies = categories_frequencies;
5266 histograms(used_column_index).centers = centers;
5268 used_column_index++;
5271 else if(columns(i).type == ColumnType::Binary)
5273 if(columns(i).column_use == VariableUse::UnusedVariable)
5279 Tensor<Index, 1> binary_frequencies(2);
5280 binary_frequencies.setZero();
5282 for(Index j = 0; j < used_samples_number; j++)
5284 if(
abs(
data(used_samples_indices(j), variable_index) - type(1)) < type(NUMERIC_LIMITS_MIN))
5286 binary_frequencies(0)++;
5290 binary_frequencies(1)++;
5294 histograms(used_column_index).frequencies = binary_frequencies;
5296 used_column_index++;
5328 Tensor<BoxPlot, 1> box_plots(used_columns_number);
5330 Index used_column_index = 0;
5331 Index variable_index = 0;
5333 for(Index i = 0; i < columns_number; i++)
5335 if(columns(i).type == ColumnType::Numeric || columns(i).type == ColumnType::Binary)
5337 if(columns(i).column_use != VariableUse::UnusedVariable)
5339 box_plots(used_column_index) = box_plot(
data.chip(variable_index, 1), used_samples_indices);
5341 used_column_index++;
5346 else if(columns(i).type == ColumnType::Categorical)
5348 variable_index += columns(i).get_categories_number();
5365 Index negatives = 0;
5369 const Index used_samples_number = used_indices.size();
5371 for(Index i = 0; i < used_samples_number; i++)
5373 const Index training_index = used_indices(i);
5375 if(
abs(
data(training_index, target_index)) < type(NUMERIC_LIMITS_MIN))
5379 else if(
abs(
data(training_index, target_index) - type(1)) > type(NUMERIC_LIMITS_MIN))
5381 ostringstream buffer;
5383 buffer <<
"OpenNN Exception: DataSet class.\n"
5384 <<
"Index calculate_used_negatives(const Index&) const method.\n"
5385 <<
"Training sample is neither a positive nor a negative: " <<
data(training_index, target_index) << endl;
5387 throw logic_error(buffer.str());
5400 Index negatives = 0;
5404 const Index training_samples_number = training_indices.size();
5406 for(Index i = 0; i < training_samples_number; i++)
5408 const Index training_index = training_indices(i);
5410 if(
abs(
data(training_index, target_index)) < type(NUMERIC_LIMITS_MIN))
5414 else if(
abs(
data(training_index, target_index) -
static_cast<type
>(1)) >
static_cast<type
>(1.0e-3))
5416 ostringstream buffer;
5418 buffer <<
"OpenNN Exception: DataSet class.\n"
5419 <<
"Index calculate_training_negatives(const Index&) const method.\n"
5420 <<
"Training sample is neither a positive nor a negative: " <<
data(training_index, target_index) << endl;
5422 throw logic_error(buffer.str());
5435 Index negatives = 0;
5441 for(Index i = 0; i < static_cast<Index>(selection_samples_number); i++)
5443 const Index selection_index = selection_indices(i);
5445 if(
abs(
data(selection_index, target_index)) < type(NUMERIC_LIMITS_MIN))
5449 else if(
abs(
data(selection_index, target_index) - type(1)) > type(NUMERIC_LIMITS_MIN))
5451 ostringstream buffer;
5453 buffer <<
"OpenNN Exception: DataSet class.\n"
5454 <<
"Index calculate_testing_negatives(const Index&) const method.\n"
5455 <<
"Selection sample is neither a positive nor a negative: " <<
data(selection_index, target_index) << endl;
5457 throw logic_error(buffer.str());
5470 Index negatives = 0;
5476 for(Index i = 0; i < static_cast<Index>(testing_samples_number); i++)
5478 const Index testing_index = testing_indices(i);
5480 if(
data(testing_index, target_index) < type(NUMERIC_LIMITS_MIN))
5501 return descriptives(
data);
5519 return descriptives(
data, used_samples_indices, used_variables_indices);
5532 if(targets_number != 1)
5534 ostringstream buffer;
5536 buffer <<
"OpenNN Exception: DataSet class.\n"
5537 <<
"Tensor<type, 2> calculate_columns_descriptives_positive_samples() const method.\n"
5538 <<
"Number of targets muste be 1.\n";
5540 throw logic_error(buffer.str());
5549 const Index samples_number = used_samples_indices.size();
5553 Index positive_samples_number = 0;
5555 for(Index i = 0; i < samples_number; i++)
5557 Index sample_index = used_samples_indices(i);
5559 if(
abs(
data(sample_index, target_index) - type(1)) < type(NUMERIC_LIMITS_MIN)) positive_samples_number++;
5564 Tensor<Index, 1> positive_used_samples_indices(positive_samples_number);
5565 Index positive_sample_index = 0;
5567 for(Index i = 0; i < samples_number; i++)
5569 Index sample_index = used_samples_indices(i);
5571 if(
abs(
data(sample_index, target_index) - type(1)) < type(NUMERIC_LIMITS_MIN))
5573 positive_used_samples_indices(positive_sample_index) = sample_index;
5574 positive_sample_index++;
5578 return descriptives(
data, positive_used_samples_indices, input_variables_indices);
5591 if(targets_number != 1)
5593 ostringstream buffer;
5595 buffer <<
"OpenNN Exception: DataSet class.\n"
5596 <<
"Tensor<type, 2> calculate_columns_descriptives_positive_samples() const method.\n"
5597 <<
"Number of targets muste be 1.\n";
5599 throw logic_error(buffer.str());
5608 const Index samples_number = used_samples_indices.size();
5612 Index negative_samples_number = 0;
5614 for(Index i = 0; i < samples_number; i++)
5616 Index sample_index = used_samples_indices(i);
5618 if(
data(sample_index, target_index) < type(NUMERIC_LIMITS_MIN)) negative_samples_number++;
5623 Tensor<Index, 1> negative_used_samples_indices(negative_samples_number);
5624 Index negative_sample_index = 0;
5626 for(Index i = 0; i < samples_number; i++)
5628 Index sample_index = used_samples_indices(i);
5630 if(
data(sample_index, target_index) < type(NUMERIC_LIMITS_MIN))
5632 negative_used_samples_indices(negative_sample_index) = sample_index;
5633 negative_sample_index++;
5638 return descriptives(
data, negative_used_samples_indices, input_variables_indices);
5650 const Index samples_number = used_samples_indices.size();
5654 Index class_samples_number = 0;
5656 for(Index i = 0; i < samples_number; i++)
5658 Index sample_index = used_samples_indices(i);
5660 if(
abs(
data(sample_index, class_index) - type(1)) < type(NUMERIC_LIMITS_MIN)) class_samples_number++;
5665 Tensor<Index, 1> class_used_samples_indices(class_samples_number);
5666 class_used_samples_indices.setZero();
5667 Index class_sample_index = 0;
5669 for(Index i = 0; i < samples_number; i++)
5671 Index sample_index = used_samples_indices(i);
5673 if(
abs(
data(sample_index, class_index) - type(1)) < type(NUMERIC_LIMITS_MIN))
5675 class_used_samples_indices(class_sample_index) = sample_index;
5676 class_sample_index++;
5680 return descriptives(
data, class_used_samples_indices, input_variables_indices);
5699 return descriptives(
data, training_indices, used_indices);
5718 return descriptives(
data, selection_indices, used_indices);
5732 return descriptives(
data, used_samples_indices, input_variables_indices);
5751 return descriptives(
data, used_indices, target_variables_indices);
// Returns the descriptives (minimum, maximum, mean, standard deviation) of
// the target variables, computed over the testing samples only.
// NOTE(review): testing_indices and target_variables_indices are fetched in
// statements not visible in this chunk — presumably via
// get_testing_samples_indices() and get_target_variables_indices(); confirm
// against the full source.
5755Tensor<Descriptives, 1> DataSet::calculate_testing_target_variables_descriptives()
const
5761 return descriptives(
data, testing_indices, target_variables_indices);
5811 const Index variables_number = variables_indices.size();
5813 Tensor<type, 1> means(variables_number);
5816 #pragma omp parallel for
5818 for(Index i = 0; i < variables_number; i++)
5820 const Index variable_index = variables_indices(i);
5822 Tensor<type, 0> mean =
data.chip(variable_index, 1).mean();
// Returns the per-variable mean of the target variables computed over the
// used (non-unused) samples.
// NOTE(review): used_indices and target_variables_indices are fetched in
// statements not visible in this chunk — presumably via
// get_used_samples_indices() and get_target_variables_indices(); confirm
// against the full source.
5831Tensor<type, 1> DataSet::calculate_used_targets_mean()
const
5837 return mean(
data, used_indices, target_variables_indices);
5849 return mean(
data, selection_indices, target_variables_indices);
5886 Tensor<Correlation, 2> correlations(input_columns_number, target_columns_number);
5888 for(Index i = 0; i < input_columns_number; i++)
5890 const Index input_index = input_columns_indices(i);
5892 const Tensor<type, 2> input_column_data =
get_column_data(input_index, used_samples_indices);
5894 for(Index j = 0; j < target_columns_number; j++)
5896 const Index target_index = target_columns_indices(j);
5898 const Tensor<type, 2> target_column_data =
get_column_data(target_index, used_samples_indices);
5900 correlations(i,j) = OpenNN::correlation(thread_pool_device, input_column_data, target_column_data);
5902 cout << columns(input_index).name <<
" - " << columns(target_index).name <<
" correlation: " << correlations(i,j).r << endl;
5906 return correlations;
5914 for(Index i = 0; i <
data.size(); i++)
if(
isnan(
data(i)))
return true;
5924 for(Index j = 0; j <
data.dimension(1); j++)
5926 if(
isnan(
data(row_index,j)))
return true;
5946 const Tensor<Index, 0> columns_with_missing_values = count_nan_columns().sum();
5948 cout <<
"Columns with missing values: " << columns_with_missing_values(0)
5949 <<
" (" << columns_with_missing_values(0)*100/
data.dimension(1) <<
"%)" << endl;
5951 const Index samples_with_missing_values = count_rows_with_nan();
5953 cout <<
"Samples with missing values: "
5954 << samples_with_missing_values <<
" (" << samples_with_missing_values*100/
data.dimension(0) <<
"%)" << endl;
5970 for(Index j = 0; j < targets_number; j++)
5972 for(Index i = 0; i < inputs_number; i++)
5974 cout << targets_name(j) <<
" - " << inputs_names(i) <<
": " << correlations(i,j).r << endl;
5993 Tensor<type, 1> target_correlations(inputs_number);
5995 Tensor<string, 2> top_correlations(inputs_number, 2);
5997 map<type,string> top_correlation;
5999 for(Index i = 0 ; i < inputs_number; i++)
6001 for(Index j = 0 ; j < targets_number ; j++)
6003 top_correlation.insert(pair<type,string>(correlations(i,j), inputs_names(i) +
" - " + targets_name(j)));
6007 map<type,string>::iterator it;
6009 for(it = top_correlation.begin(); it != top_correlation.end(); it++)
6011 cout <<
"Correlation: " << (*it).first <<
" between " << (*it).second <<
"" << endl;
6025 Tensor<Correlation, 2> correlations(input_columns_number, input_columns_number);
6027 for(Index i = 0; i < input_columns_number; i++)
6029 const Index current_input_index_i = input_columns_indices(i);
6031 const Tensor<type, 2> input_i =
get_column_data(current_input_index_i);
6033 cout <<
"Calculating " << columns(current_input_index_i).name <<
" correlations. " << endl;
6035 for(Index j = i; j < input_columns_number; j++)
6037 const Index current_input_index_j = input_columns_indices(j);
6039 const Tensor<type, 2> input_j =
get_column_data(current_input_index_j);
6041 correlations(i,j) = OpenNN::correlation(thread_pool_device, input_i, input_j);
6043 if(correlations(i,j).r > type(1))
6045 correlations(i,j).r = type(1);
6050 for(Index i = 0; i < input_columns_number; i++)
6052 for(Index j = 0; j < i; j++)
6054 correlations(i,j) = correlations(j,i);
6058 return correlations;
6068 cout << inputs_correlations << endl;
// Prints the cached data-file preview to standard output, one token at a
// time separated by single spaces.
// NOTE(review): no per-row newline is visible here — the row separator
// statement may have been lost in extraction; confirm against the full
// source before relying on the output format.
6072void DataSet::print_data_file_preview()
const
6074 const Index size = data_file_preview.size();
6076 for(Index i = 0; i < size; i++)
6078 for(Index j = 0; j < data_file_preview(i).size(); j++)
6080 cout << data_file_preview(i)(j) <<
" ";
6099 const Index correlations_number = variables_number*(variables_number-1)/2;
6101 Tensor<string, 2> top_correlations(correlations_number, 3);
6103 map<type, string> top_correlation;
6105 for(Index i = 0; i < variables_number; i++)
6107 for(Index j = i; j < variables_number; j++)
6109 if(i == j)
continue;
6111 top_correlation.insert(pair<type,string>(variables_correlations(i,j), variables_name(i) +
" - " + variables_name(j)));
6115 map<type,string> ::iterator it;
6117 for(it = top_correlation.begin(); it != top_correlation.end(); it++)
6119 cout <<
"Correlation: " << (*it).first <<
" between " << (*it).second <<
"" << endl;
6130 const Index columns_number = columns.size();
6132 for(Index i = 0; i < columns_number; i++)
6134 if(columns(i).type == ColumnType::Numeric)
6136 columns(i).scaler = Scaler::MeanStandardDeviation;
6140 columns(i).scaler = Scaler::MinimumMaximum;
6146Tensor<Descriptives, 1> DataSet::scale_data()
6154 for(Index i = 0; i < variables_number; i++)
6158 switch(columns(column_index).scaler)
6160 case Scaler::NoScaling:
6164 case Scaler::MinimumMaximum:
6165 scale_minimum_maximum(
data, i, variables_descriptives(i));
6168 case Scaler::MeanStandardDeviation:
6169 scale_mean_standard_deviation(
data, i, variables_descriptives(i));
6172 case Scaler::StandardDeviation:
6173 scale_standard_deviation(
data, i, variables_descriptives(i));
6176 case Scaler::Logarithm:
6177 scale_logarithmic(
data, i);
6182 ostringstream buffer;
6184 buffer <<
"OpenNN Exception: DataSet class\n"
6185 <<
"void scale_data() method.\n"
6186 <<
"Unknown scaler: " << int(columns(i).scaler) <<
"\n";
6188 throw logic_error(buffer.str());
6193 return variables_descriptives;
// Undoes scale_data(): applies each column's inverse scaling to every
// variable, using the descriptives captured before scaling.
// @param variables_descriptives Per-variable descriptives returned by
//        scale_data() before the data was scaled.
6197void DataSet::unscale_data(
const Tensor<Descriptives, 1>& variables_descriptives)
6201 for(Index i = 0; i < variables_number; i++)
// NOTE(review): columns is indexed with the variable index i here, whereas
// scale_data() maps i through get_column_index(i) first — these differ for
// data sets with categorical columns; confirm which indexing is intended.
6203 switch(columns(i).scaler)
6205 case Scaler::NoScaling:
6209 case Scaler::MinimumMaximum:
6210 unscale_minimum_maximum(
data, i, variables_descriptives(i));
6213 case Scaler::MeanStandardDeviation:
6214 unscale_mean_standard_deviation(
data, i, variables_descriptives(i));
6217 case Scaler::StandardDeviation:
6218 unscale_standard_deviation(
data, i, variables_descriptives(i));
6221 case Scaler::Logarithm:
6222 unscale_logarithmic(
data, i);
6227 ostringstream buffer;
6229 buffer <<
"OpenNN Exception: DataSet class\n"
6230 <<
"void unscale_data() method.\n"
6231 <<
"Unknown scaler: " << int(columns(i).scaler) <<
"\n";
6233 throw logic_error(buffer.str());
6248 const Tensor<Scaler, 1> input_variables_scalers = get_input_variables_scalers();
6252 for(Index i = 0; i < input_variables_number; i++)
6254 switch(input_variables_scalers(i))
6256 case Scaler::NoScaling:
6260 case Scaler::MinimumMaximum:
6261 scale_minimum_maximum(
data, input_variables_indices(i), input_variables_descriptives(i));
6264 case Scaler::MeanStandardDeviation:
6265 scale_mean_standard_deviation(
data, input_variables_indices(i), input_variables_descriptives(i));
6268 case Scaler::StandardDeviation:
6269 scale_standard_deviation(
data, input_variables_indices(i), input_variables_descriptives(i));
6272 case Scaler::Logarithm:
6273 scale_logarithmic(
data, input_variables_indices(i));
6278 ostringstream buffer;
6280 buffer <<
"OpenNN Exception: DataSet class\n"
6281 <<
"void scale_input_variables(const Tensor<string, 1>&, const Tensor<Descriptives, 1>&) method.\n"
6282 <<
"Unknown scaling and unscaling method: " << int(input_variables_scalers(i)) <<
"\n";
6284 throw logic_error(buffer.str());
6289 return input_variables_descriptives;
6303 const Tensor<Scaler, 1> target_variables_scalers = get_target_variables_scalers();
6307 for(Index i = 0; i < target_variables_number; i++)
6309 switch(target_variables_scalers(i))
6311 case Scaler::NoScaling:
6315 case Scaler::MinimumMaximum:
6316 scale_minimum_maximum(
data, target_variables_indices(i), target_variables_descriptives(i));
6319 case Scaler::MeanStandardDeviation:
6320 scale_mean_standard_deviation(
data, target_variables_indices(i), target_variables_descriptives(i));
6323 case Scaler::StandardDeviation:
6324 scale_standard_deviation(
data, target_variables_indices(i), target_variables_descriptives(i));
6327 case Scaler::Logarithm:
6328 scale_logarithmic(
data, target_variables_indices(i));
6333 ostringstream buffer;
6335 buffer <<
"OpenNN Exception: DataSet class\n"
6336 <<
"void scale_input_variables(const Tensor<string, 1>&, const Tensor<Descriptives, 1>&) method.\n"
6337 <<
"Unknown scaling and unscaling method: " << int(target_variables_scalers(i)) <<
"\n";
6339 throw logic_error(buffer.str());
6344 return target_variables_descriptives;
6357 const Tensor<Scaler, 1> input_variables_scalers = get_input_variables_scalers();
6359 for(Index i = 0; i < input_variables_number; i++)
6361 switch(input_variables_scalers(i))
6363 case Scaler::NoScaling:
6367 case Scaler::MinimumMaximum:
6368 unscale_minimum_maximum(
data, input_variables_indices(i), input_variables_descriptives(i));
6371 case Scaler::MeanStandardDeviation:
6372 unscale_mean_standard_deviation(
data, input_variables_indices(i), input_variables_descriptives(i));
6375 case Scaler::StandardDeviation:
6376 unscale_standard_deviation(
data, input_variables_indices(i), input_variables_descriptives(i));
6381 ostringstream buffer;
6383 buffer <<
"OpenNN Exception: DataSet class\n"
6384 <<
"void unscale_input_variables(const Tensor<string, 1>&, const Tensor<Descriptives, 1>&) method.\n"
6385 <<
"Unknown unscaling and unscaling method: " << int(input_variables_scalers(i)) <<
"\n";
6387 throw logic_error(buffer.str());
6401 const Tensor<Scaler, 1> target_variables_scalers = get_target_variables_scalers();
6403 for(Index i = 0; i < target_variables_number; i++)
6405 switch(target_variables_scalers(i))
6407 case Scaler::NoScaling:
6410 case Scaler::MinimumMaximum:
6411 unscale_minimum_maximum(
data, target_variables_indices(i), targets_descriptives(i));
6414 case Scaler::MeanStandardDeviation:
6415 unscale_mean_standard_deviation(
data, target_variables_indices(i), targets_descriptives(i));
6418 case Scaler::Logarithm:
6419 unscale_logarithmic(
data, target_variables_indices(i));
6424 ostringstream buffer;
6426 buffer <<
"OpenNN Exception: DataSet class\n"
6427 <<
"void unscale_targets(const Tensor<Descriptives, 1>&) method.\n"
6428 <<
"Unknown unscaling and unscaling method.\n";
6430 throw logic_error(buffer.str());
6442 data.setConstant(new_value);
6462 const Index samples_number =
data.dimension(0);
6463 const Index variables_number =
data.dimension(1);
6466 const Index target_variables_number = variables_number - input_variables_number;
6468 Index target_variable_index = 0;
6470 for(Index i = 0; i < samples_number; i++)
6472 if(target_variables_number == 1) target_variable_index = rand()%2;
6473 else target_variable_index = rand()%(variables_number-input_variables_number)+input_variables_number;
6475 for(Index j = input_variables_number; j < variables_number; j++)
6477 if(target_variables_number == 1)
data(i,j) =
static_cast<type
>(target_variable_index);
6478 else data(i,j) = (j == target_variable_index) ? type(1) : type(0);
6488 ostringstream buffer;
6490 file_stream.OpenElement(
"DataSet");
6494 file_stream.OpenElement(
"DataFile");
6498 file_stream.OpenElement(
"FileType");
6507 file_stream.OpenElement(
"DataFileName");
6516 file_stream.OpenElement(
"Separator");
6525 file_stream.OpenElement(
"ColumnsNames");
6530 file_stream.
PushText(buffer.str().c_str());
6537 file_stream.OpenElement(
"RowsLabels");
6543 file_stream.
PushText(buffer.str().c_str());
6550 file_stream.OpenElement(
"MissingValuesLabel");
6559 file_stream.OpenElement(
"LagsNumber");
6564 file_stream.
PushText(buffer.str().c_str());
6571 file_stream.OpenElement(
"StepsAhead");
6576 file_stream.
PushText(buffer.str().c_str());
6583 file_stream.OpenElement(
"TimeColumn");
6588 file_stream.
PushText(buffer.str().c_str());
6598 file_stream.OpenElement(
"Columns");
6602 file_stream.OpenElement(
"ColumnsNumber");
6607 file_stream.
PushText(buffer.str().c_str());
6617 for(Index i = 0; i < columns_number; i++)
6619 file_stream.OpenElement(
"Column");
6623 columns(i).write_XML(file_stream);
6637 if(time_series_columns_number != 0)
6639 file_stream.OpenElement(
"TimeSeriesColumns");
6643 file_stream.OpenElement(
"TimeSeriesColumnsNumber");
6648 file_stream.
PushText(buffer.str().c_str());
6655 for(Index i = 0; i < time_series_columns_number; i++)
6657 file_stream.OpenElement(
"TimeSeriesColumn");
6661 time_series_columns(i).write_XML(file_stream);
6675 const Index rows_labels_number = rows_labels.dimension(0);
6677 file_stream.OpenElement(
"RowsLabels");
6681 for(Index i = 0; i < rows_labels_number; i++)
6683 buffer << rows_labels(i);
6685 if(i != rows_labels_number-1) buffer <<
",";
6688 file_stream.
PushText(buffer.str().c_str());
6695 file_stream.OpenElement(
"Samples");
6699 file_stream.OpenElement(
"SamplesNumber");
6702 buffer << get_samples_number();
6704 file_stream.
PushText(buffer.str().c_str());
6712 file_stream.OpenElement(
"SamplesUses");
6716 const Index samples_number = get_samples_number();
6718 for(Index i = 0; i < samples_number; i++)
6722 buffer << Index(sample_use);
6724 if(i < (samples_number-1)) buffer <<
" ";
6727 file_stream.
PushText(buffer.str().c_str());
6738 file_stream.OpenElement(
"MissingValues");
6743 file_stream.OpenElement(
"MissingValuesMethod");
6764 file_stream.OpenElement(
"MissingValuesNumber");
6769 file_stream.
PushText(buffer.str().c_str());
6778 file_stream.OpenElement(
"ColumnsMissingValuesNumber");
6780 const Index columns_number = columns_missing_values_number.size();
6784 for(Index i = 0; i < columns_number; i++)
6786 buffer << columns_missing_values_number(i);
6788 if(i != (columns_number-1)) buffer <<
" ";
6791 file_stream.
PushText(buffer.str().c_str());
6798 file_stream.OpenElement(
"RowsMissingValuesNumber");
6801 buffer << rows_missing_values_number;
6803 file_stream.
PushText(buffer.str().c_str());
6815 file_stream.OpenElement(
"PreviewData");
6817 file_stream.OpenElement(
"PreviewSize");
6820 buffer << data_file_preview.size();
6822 file_stream.
PushText(buffer.str().c_str());
6826 for(Index i = 0; i < data_file_preview.size(); i++)
6828 file_stream.OpenElement(
"Row");
6832 for(Index j = 0; j < data_file_preview(i).size(); j++)
6834 file_stream.
PushText(data_file_preview(i)(j).c_str());
6836 if(j != data_file_preview(i).size()-1)
6857 ostringstream buffer;
6863 if(!data_set_element)
6865 buffer <<
"OpenNN Exception: DataSet class.\n"
6866 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
6867 <<
"Data set element is nullptr.\n";
6869 throw logic_error(buffer.str());
6874 const tinyxml2::XMLElement* data_file_element = data_set_element->FirstChildElement(
"DataFile");
6876 if(!data_file_element)
6878 buffer <<
"OpenNN Exception: DataSet class.\n"
6879 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
6880 <<
"Data file element is nullptr.\n";
6882 throw logic_error(buffer.str());
6887 const tinyxml2::XMLElement* data_file_name_element = data_file_element->FirstChildElement(
"DataFileName");
6889 if(!data_file_name_element)
6891 buffer <<
"OpenNN Exception: DataSet class.\n"
6892 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
6893 <<
"DataFileName element is nullptr.\n";
6895 throw logic_error(buffer.str());
6898 if(data_file_name_element->GetText())
6900 const string new_data_file_name = data_file_name_element->GetText();
6907 const tinyxml2::XMLElement* separator_element = data_file_element->FirstChildElement(
"Separator");
6909 if(separator_element)
6911 if(separator_element->GetText())
6913 const string new_separator = separator_element->GetText();
6929 const tinyxml2::XMLElement* columns_names_element = data_file_element->FirstChildElement(
"ColumnsNames");
6931 if(columns_names_element)
6933 const string new_columns_names_string = columns_names_element->GetText();
6939 catch(
const logic_error& e)
6941 cerr << e.what() << endl;
6947 const tinyxml2::XMLElement* rows_label_element = data_file_element->FirstChildElement(
"RowsLabels");
6949 if(rows_label_element)
6951 const string new_rows_label_string = rows_label_element->GetText();
6957 catch(
const logic_error& e)
6959 cerr << e.what() << endl;
6965 const tinyxml2::XMLElement* missing_values_label_element = data_file_element->FirstChildElement(
"MissingValuesLabel");
6967 if(missing_values_label_element)
6969 if(missing_values_label_element->GetText())
6971 const string new_missing_values_label = missing_values_label_element->GetText();
6989 const tinyxml2::XMLElement* lags_number_element = data_file_element->FirstChildElement(
"LagsNumber");
6991 if(!lags_number_element)
6993 buffer <<
"OpenNN Exception: DataSet class.\n"
6994 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
6995 <<
"Lags number element is nullptr.\n";
6997 throw logic_error(buffer.str());
7000 if(lags_number_element->GetText())
7002 const Index new_lags_number =
static_cast<Index
>(atoi(lags_number_element->GetText()));
7009 const tinyxml2::XMLElement* steps_ahead_element = data_file_element->FirstChildElement(
"StepsAhead");
7011 if(!steps_ahead_element)
7013 buffer <<
"OpenNN Exception: DataSet class.\n"
7014 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7015 <<
"Steps ahead element is nullptr.\n";
7017 throw logic_error(buffer.str());
7020 if(steps_ahead_element->GetText())
7022 const Index new_steps_ahead =
static_cast<Index
>(atoi(steps_ahead_element->GetText()));
7029 const tinyxml2::XMLElement* time_column_element = data_file_element->FirstChildElement(
"TimeColumn");
7031 if(!time_column_element)
7033 buffer <<
"OpenNN Exception: DataSet class.\n"
7034 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7035 <<
"Time column element is nullptr.\n";
7037 throw logic_error(buffer.str());
7040 if(time_column_element->GetText())
7042 const string new_time_column = time_column_element->GetText();
7051 if(!columns_element)
7053 buffer <<
"OpenNN Exception: DataSet class.\n"
7054 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7055 <<
"Columns element is nullptr.\n";
7057 throw logic_error(buffer.str());
7062 const tinyxml2::XMLElement* columns_number_element = columns_element->FirstChildElement(
"ColumnsNumber");
7064 if(!columns_number_element)
7066 buffer <<
"OpenNN Exception: DataSet class.\n"
7067 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7068 <<
"Columns number element is nullptr.\n";
7070 throw logic_error(buffer.str());
7073 Index new_columns_number = 0;
7075 if(columns_number_element->GetText())
7077 new_columns_number =
static_cast<Index
>(atoi(columns_number_element->GetText()));
7086 if(new_columns_number > 0)
7088 for(Index i = 0; i < new_columns_number; i++)
7091 start_element = column_element;
7093 if(column_element->Attribute(
"Item") != to_string(i+1))
7095 buffer <<
"OpenNN Exception: DataSet class.\n"
7096 <<
"void DataSet:from_XML(const tinyxml2::XMLDocument&) method.\n"
7097 <<
"Column item number (" << i+1 <<
") does not match (" << column_element->Attribute(
"Item") <<
").\n";
7099 throw logic_error(buffer.str());
7108 buffer <<
"OpenNN Exception: DataSet class.\n"
7109 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7110 <<
"Name element is nullptr.\n";
7112 throw logic_error(buffer.str());
7115 if(name_element->GetText())
7117 const string new_name = name_element->GetText();
7119 columns(i).name = new_name;
7128 buffer <<
"OpenNN Exception: DataSet class.\n"
7129 <<
"void DataSet::from_XML(const tinyxml2::XMLDocument&) method.\n"
7130 <<
"Scaler element is nullptr.\n";
7132 throw logic_error(buffer.str());
7135 if(scaler_element->GetText())
7137 const string new_scaler = scaler_element->GetText();
7139 columns(i).set_scaler(new_scaler);
7144 const tinyxml2::XMLElement* column_use_element = column_element->FirstChildElement(
"ColumnUse");
7146 if(!column_use_element)
7148 buffer <<
"OpenNN Exception: DataSet class.\n"
7149 <<
"void DataSet::from_XML(const tinyxml2::XMLDocument&) method.\n"
7150 <<
"Column use element is nullptr.\n";
7152 throw logic_error(buffer.str());
7155 if(column_use_element->GetText())
7157 const string new_column_use = column_use_element->GetText();
7159 columns(i).set_use(new_column_use);
7168 buffer <<
"OpenNN Exception: DataSet class.\n"
7169 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7170 <<
"Type element is nullptr.\n";
7172 throw logic_error(buffer.str());
7175 if(type_element->GetText())
7177 const string new_type = type_element->GetText();
7178 columns(i).set_type(new_type);
7181 if(columns(i).type == ColumnType::Categorical || columns(i).type == ColumnType::Binary)
7185 const tinyxml2::XMLElement* categories_element = column_element->FirstChildElement(
"Categories");
7187 if(!categories_element)
7189 buffer <<
"OpenNN Exception: DataSet class.\n"
7190 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7191 <<
"Categories element is nullptr.\n";
7193 throw logic_error(buffer.str());
7196 if(categories_element->GetText())
7198 const string new_categories = categories_element->GetText();
7200 columns(i).categories = get_tokens(new_categories,
';');
7205 const tinyxml2::XMLElement* categories_uses_element = column_element->FirstChildElement(
"CategoriesUses");
7207 if(!categories_uses_element)
7209 buffer <<
"OpenNN Exception: DataSet class.\n"
7210 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7211 <<
"Categories uses element is nullptr.\n";
7213 throw logic_error(buffer.str());
7216 if(categories_uses_element->GetText())
7218 const string new_categories_uses = categories_uses_element->GetText();
7220 columns(i).set_categories_uses(get_tokens(new_categories_uses,
';'));
7229 const tinyxml2::XMLElement* time_series_columns_element = data_set_element->FirstChildElement(
"TimeSeriesColumns");
7231 if(!time_series_columns_element)
7245 const tinyxml2::XMLElement* time_series_columns_number_element = time_series_columns_element->FirstChildElement(
"TimeSeriesColumnsNumber");
7247 if(!time_series_columns_number_element)
7249 buffer <<
"OpenNN Exception: DataSet class.\n"
7250 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7251 <<
"Time seires columns number element is nullptr.\n";
7253 throw logic_error(buffer.str());
7256 Index time_series_new_columns_number = 0;
7258 if(time_series_columns_number_element->GetText())
7260 time_series_new_columns_number =
static_cast<Index
>(atoi(time_series_columns_number_element->GetText()));
7262 set_time_series_columns_number(time_series_new_columns_number);
7269 if(time_series_new_columns_number > 0)
7271 for(Index i = 0; i < time_series_new_columns_number; i++)
7274 time_series_start_element = time_series_column_element;
7276 if(time_series_column_element->Attribute(
"Item") != to_string(i+1))
7278 buffer <<
"OpenNN Exception: DataSet class.\n"
7279 <<
"void DataSet:from_XML(const tinyxml2::XMLDocument&) method.\n"
7280 <<
"Time series column item number (" << i+1 <<
") does not match (" << time_series_column_element->Attribute(
"Item") <<
").\n";
7282 throw logic_error(buffer.str());
7287 const tinyxml2::XMLElement* time_series_name_element = time_series_column_element->FirstChildElement(
"Name");
7289 if(!time_series_name_element)
7291 buffer <<
"OpenNN Exception: DataSet class.\n"
7292 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7293 <<
"Time series name element is nullptr.\n";
7295 throw logic_error(buffer.str());
7298 if(time_series_name_element->GetText())
7300 const string time_series_new_name = time_series_name_element->GetText();
7302 time_series_columns(i).name = time_series_new_name;
7307 const tinyxml2::XMLElement* time_series_scaler_element = time_series_column_element->FirstChildElement(
"Scaler");
7309 if(!time_series_scaler_element)
7311 buffer <<
"OpenNN Exception: DataSet class.\n"
7312 <<
"void DataSet::from_XML(const tinyxml2::XMLDocument&) method.\n"
7313 <<
"Time series scaler element is nullptr.\n";
7315 throw logic_error(buffer.str());
7318 if(time_series_scaler_element->GetText())
7320 const string time_series_new_scaler = time_series_scaler_element->GetText();
7322 time_series_columns(i).set_scaler(time_series_new_scaler);
7327 const tinyxml2::XMLElement* time_series_column_use_element = time_series_column_element->FirstChildElement(
"ColumnUse");
7329 if(!time_series_column_use_element)
7331 buffer <<
"OpenNN Exception: DataSet class.\n"
7332 <<
"void DataSet::from_XML(const tinyxml2::XMLDocument&) method.\n"
7333 <<
"Time series column use element is nullptr.\n";
7335 throw logic_error(buffer.str());
7338 if(time_series_column_use_element->GetText())
7340 const string time_series_new_column_use = time_series_column_use_element->GetText();
7342 time_series_columns(i).set_use(time_series_new_column_use);
7347 const tinyxml2::XMLElement* time_series_type_element = time_series_column_element->FirstChildElement(
"Type");
7349 if(!time_series_type_element)
7351 buffer <<
"OpenNN Exception: DataSet class.\n"
7352 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7353 <<
"Time series type element is nullptr.\n";
7355 throw logic_error(buffer.str());
7358 if(time_series_type_element->GetText())
7360 const string time_series_new_type = time_series_type_element->GetText();
7361 time_series_columns(i).set_type(time_series_new_type);
7364 if(time_series_columns(i).type == ColumnType::Categorical || time_series_columns(i).type == ColumnType::Binary)
7368 const tinyxml2::XMLElement* time_series_categories_element = time_series_column_element->FirstChildElement(
"Categories");
7370 if(!time_series_categories_element)
7372 buffer <<
"OpenNN Exception: DataSet class.\n"
7373 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7374 <<
"Time series categories element is nullptr.\n";
7376 throw logic_error(buffer.str());
7379 if(time_series_categories_element->GetText())
7381 const string time_series_new_categories = time_series_categories_element->GetText();
7383 time_series_columns(i).categories = get_tokens(time_series_new_categories,
';');
7388 const tinyxml2::XMLElement* time_series_categories_uses_element = time_series_column_element->FirstChildElement(
"CategoriesUses");
7390 if(!time_series_categories_uses_element)
7392 buffer <<
"OpenNN Exception: DataSet class.\n"
7393 <<
"void Column::from_XML(const tinyxml2::XMLDocument&) method.\n"
7394 <<
"Time series categories uses element is nullptr.\n";
7396 throw logic_error(buffer.str());
7399 if(time_series_categories_uses_element->GetText())
7401 const string time_series_new_categories_uses = time_series_categories_uses_element->GetText();
7403 time_series_columns(i).set_categories_uses(get_tokens(time_series_new_categories_uses,
';'));
7416 const tinyxml2::XMLElement* rows_labels_element = data_set_element->FirstChildElement(
"RowsLabels");
7418 if(!rows_labels_element)
7420 buffer <<
"OpenNN Exception: DataSet class.\n"
7421 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7422 <<
"Rows labels element is nullptr.\n";
7424 throw logic_error(buffer.str());
7429 if(rows_labels_element->GetText())
7431 const string new_rows_labels = rows_labels_element->GetText();
7433 rows_labels = get_tokens(new_rows_labels,
',');
7441 if(!samples_element)
7443 buffer <<
"OpenNN Exception: DataSet class.\n"
7444 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7445 <<
"Samples element is nullptr.\n";
7447 throw logic_error(buffer.str());
7452 const tinyxml2::XMLElement* samples_number_element = samples_element->FirstChildElement(
"SamplesNumber");
7454 if(!samples_number_element)
7456 buffer <<
"OpenNN Exception: DataSet class.\n"
7457 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7458 <<
"Samples number element is nullptr.\n";
7460 throw logic_error(buffer.str());
7463 if(samples_number_element->GetText())
7465 const Index new_samples_number =
static_cast<Index
>(atoi(samples_number_element->GetText()));
7467 samples_uses.resize(new_samples_number);
7472 const tinyxml2::XMLElement* samples_uses_element = samples_element->FirstChildElement(
"SamplesUses");
7474 if(!samples_uses_element)
7476 buffer <<
"OpenNN Exception: DataSet class.\n"
7477 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7478 <<
"Samples uses element is nullptr.\n";
7480 throw logic_error(buffer.str());
7483 if(samples_uses_element->GetText())
7490 const tinyxml2::XMLElement* missing_values_element = data_set_element->FirstChildElement(
"MissingValues");
7492 if(!missing_values_element)
7494 buffer <<
"OpenNN Exception: DataSet class.\n"
7495 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7496 <<
"Missing values element is nullptr.\n";
7498 throw logic_error(buffer.str());
7503 const tinyxml2::XMLElement* missing_values_method_element = missing_values_element->FirstChildElement(
"MissingValuesMethod");
7505 if(!missing_values_method_element)
7507 buffer <<
"OpenNN Exception: DataSet class.\n"
7508 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7509 <<
"Missing values method element is nullptr.\n";
7511 throw logic_error(buffer.str());
7514 if(missing_values_method_element->GetText())
7521 const tinyxml2::XMLElement* missing_values_number_element = missing_values_element->FirstChildElement(
"MissingValuesNumber");
7523 if(!missing_values_number_element)
7525 buffer <<
"OpenNN Exception: DataSet class.\n"
7526 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7527 <<
"Missing values number element is nullptr.\n";
7529 throw logic_error(buffer.str());
7532 if(missing_values_number_element->GetText())
7541 const tinyxml2::XMLElement* columns_missing_values_number_element = missing_values_element->FirstChildElement(
"ColumnsMissingValuesNumber");
7543 if(!columns_missing_values_number_element)
7545 buffer <<
"OpenNN Exception: DataSet class.\n"
7546 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7547 <<
"Columns missing values number element is nullptr.\n";
7549 throw logic_error(buffer.str());
7552 if(columns_missing_values_number_element->GetText())
7554 Tensor<string, 1> new_columns_missing_values_number = get_tokens(columns_missing_values_number_element->GetText(),
' ');
7556 columns_missing_values_number.resize(new_columns_missing_values_number.size());
7558 for(Index i = 0; i < new_columns_missing_values_number.size(); i++)
7560 columns_missing_values_number(i) = atoi(new_columns_missing_values_number(i).c_str());
7566 const tinyxml2::XMLElement* rows_missing_values_number_element = missing_values_element->FirstChildElement(
"RowsMissingValuesNumber");
7568 if(!rows_missing_values_number_element)
7570 buffer <<
"OpenNN Exception: DataSet class.\n"
7571 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7572 <<
"Rows missing values number element is nullptr.\n";
7574 throw logic_error(buffer.str());
7577 if(rows_missing_values_number_element->GetText())
7579 rows_missing_values_number =
static_cast<Index
>(atoi(rows_missing_values_number_element->GetText()));
7585 const tinyxml2::XMLElement* preview_data_element = data_set_element->FirstChildElement(
"PreviewData");
7587 if(!preview_data_element)
7589 buffer <<
"OpenNN Exception: DataSet class.\n"
7590 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7591 <<
"Preview data element is nullptr.\n";
7593 throw logic_error(buffer.str());
7598 const tinyxml2::XMLElement* preview_size_element = preview_data_element->FirstChildElement(
"PreviewSize");
7600 if(!preview_size_element)
7602 buffer <<
"OpenNN Exception: DataSet class.\n"
7603 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7604 <<
"Preview size element is nullptr.\n";
7606 throw logic_error(buffer.str());
7609 Index new_preview_size = 0;
7611 if(preview_size_element->GetText())
7613 new_preview_size =
static_cast<Index
>(atoi(preview_size_element->GetText()));
7615 if(new_preview_size > 0) data_file_preview.resize(new_preview_size);
7620 start_element = preview_size_element;
7622 for(Index i = 0; i < new_preview_size; i++)
7625 start_element = row_element;
7627 if(row_element->Attribute(
"Item") != to_string(i+1))
7629 buffer <<
"OpenNN Exception: DataSet class.\n"
7630 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
7631 <<
"Row item number (" << i+1 <<
") does not match (" << row_element->Attribute(
"Item") <<
").\n";
7633 throw logic_error(buffer.str());
7636 if(row_element->GetText())
7638 data_file_preview(i) = get_tokens(row_element->GetText(),
',');
7648 const string new_display_string = display_element->GetText();
7654 catch(
const logic_error& e)
7656 cerr << e.what() << endl;
7669 const Index samples_number = get_samples_number();
7671 cout <<
"Data set object summary:\n"
7672 <<
"Number of variables: " << variables_number <<
"\n"
7673 <<
"Number of samples: " << samples_number <<
"\n";
7683 FILE* pFile = fopen(file_name.c_str(),
"w");
7720 if(document.LoadFile(file_name.c_str()))
7722 ostringstream buffer;
7724 buffer <<
"OpenNN Exception: DataSet class.\n"
7725 <<
"void load(const string&) method.\n"
7726 <<
"Cannot load XML file " << file_name <<
".\n";
7728 throw logic_error(buffer.str());
// Prints information about each column of the data set to stdout.
// NOTE(review): extraction artifact — the declaration of columns_number
// (presumably from get_columns_number()) and the loop body (presumably a
// per-column print call) are elided in this view; confirm against the
// original source before editing.
7735void DataSet::print_columns()
const
 7739 for(Index i = 0; i < columns_number; i++)
// Prints each column's type (Numeric / Binary / Categorical / DateTime /
// Constant) to stdout as a space-separated list.
// NOTE(review): extraction artifact — braces, the columns_number declaration,
// and a likely trailing "cout << endl" are elided in this view.
7749void DataSet::print_columns_types()
const
 7753 for(Index i = 0; i < columns_number; i++)
 7755 if(columns(i).type == ColumnType::Numeric) cout <<
"Numeric ";
 7756 else if(columns(i).type == ColumnType::Binary) cout <<
"Binary ";
 7757 else if(columns(i).type == ColumnType::Categorical) cout <<
"Categorical ";
 7758 else if(columns(i).type == ColumnType::DateTime) cout <<
"DateTime ";
 7759 else if(columns(i).type == ColumnType::Constant) cout <<
"Constant ";
// Prints each column's use (Input / Target / Unused) to stdout as a
// space-separated list.
// NOTE(review): extraction artifact — braces, the columns_number declaration,
// and any handling of other VariableUse values (e.g. Time) are elided in
// this view.
7766void DataSet::print_columns_uses()
const
 7770 for(Index i = 0; i < columns_number; i++)
 7772 if(columns(i).column_use == VariableUse::Input) cout <<
"Input ";
 7773 else if(columns(i).column_use == VariableUse::Target) cout <<
"Target ";
 7774 else if(columns(i).column_use == VariableUse::UnusedVariable) cout <<
"Unused ";
7796 const Index samples_number = get_samples_number();
7798 if(samples_number > 0)
7800 const Tensor<type, 1> first_sample =
data.chip(0, 0);
7802 cout <<
"First sample: \n";
7804 for(
int i = 0; i< first_sample.dimension(0); i++)
7806 cout << first_sample(i) <<
" ";
7812 if(samples_number > 1)
7814 const Tensor<type, 1> second_sample =
data.chip(1, 0);
7816 cout <<
"Second sample: \n";
7818 for(
int i = 0; i< second_sample.dimension(0); i++)
7820 cout << second_sample(i) <<
" ";
7826 if(samples_number > 2)
7828 const Tensor<type, 1> last_sample =
data.chip(samples_number-1, 0);
7830 cout <<
"Last sample: \n";
7832 for(
int i = 0; i< last_sample.dimension(0); i++)
7834 cout << last_sample(i) <<
" ";
7850 ostringstream buffer;
7852 buffer <<
"OpenNN Exception: Matrix template." << endl
7853 <<
"void save_csv(const string&, const char&, const Vector<string>&, const Vector<string>&) method." << endl
7856 throw logic_error(buffer.str());
7861 const Index samples_number = get_samples_number();
7866 char separator_char =
',';
7870 file <<
"id" << separator_char;
7872 for(Index j = 0; j < variables_number; j++)
7874 file << variables_names[j];
7876 if(j != variables_number-1)
7878 file << separator_char;
7884 for(Index i = 0; i < samples_number; i++)
7888 file << rows_labels(i) << separator_char;
7890 for(Index j = 0; j < variables_number; j++)
7894 if(j != variables_number-1)
7896 file << separator_char;
7911 ofstream file(binary_data_file_name.c_str(), ios::binary);
7915 ostringstream buffer;
7917 buffer <<
"OpenNN Exception: DataSet class." << endl
7918 <<
"void save_data_binary() method." << endl
7919 <<
"Cannot open data binary file." << endl;
7921 throw logic_error(buffer.str());
7926 streamsize size =
sizeof(Index);
7928 Index columns_number =
data.dimension(1);
7929 Index rows_number =
data.dimension(0);
7931 cout <<
"Saving binary data file..." << endl;
7933 file.write(
reinterpret_cast<char*
>(&columns_number), size);
7934 file.write(
reinterpret_cast<char*
>(&rows_number), size);
7936 size =
sizeof(type);
7940 for(
int i = 0; i < columns_number; i++)
7942 for(
int j = 0; j < rows_number; j++)
7946 file.write(
reinterpret_cast<char*
>(&value), size);
7952 cout <<
"Binary data file saved." << endl;
7960 ofstream file(binary_data_file_name.c_str(), ios::binary);
7964 ostringstream buffer;
7966 buffer <<
"OpenNN Exception: DataSet class." << endl
7967 <<
"void save_time_series_data_binary(const string) method." << endl
7968 <<
"Cannot open data binary file." << endl;
7970 throw logic_error(buffer.str());
7975 streamsize size =
sizeof(Index);
7980 cout <<
"Saving binary data file..." << endl;
7982 file.write(
reinterpret_cast<char*
>(&columns_number), size);
7983 file.write(
reinterpret_cast<char*
>(&rows_number), size);
7985 size =
sizeof(type);
7989 for(
int i = 0; i < columns_number; i++)
7991 for(
int j = 0; j < rows_number; j++)
7995 file.write(
reinterpret_cast<char*
>(&value), size);
8001 cout <<
"Binary data file saved." << endl;
8011 transform_time_series_data();
8031 ostringstream buffer;
8033 buffer <<
"OpenNN Exception: DataSet class.\n"
8034 <<
"void load_binary() method.\n"
8037 throw logic_error(buffer.str());
8040 streamsize size =
sizeof(Index);
8042 Index columns_number;
8045 file.read(
reinterpret_cast<char*
>(&columns_number), size);
8046 file.read(
reinterpret_cast<char*
>(&rows_number), size);
8048 size =
sizeof(type);
8052 data.resize(rows_number, columns_number);
8054 for(Index i = 0; i < rows_number*columns_number; i++)
8056 file.read(
reinterpret_cast<char*
>(&value), size);
8071 file.open(time_series_data_file_name.c_str(), ios::binary);
8075 ostringstream buffer;
8077 buffer <<
"OpenNN Exception: DataSet class.\n"
8078 <<
"void load_time_series_data_binary(const string&) method.\n"
8079 <<
"Cannot open binary file: " << time_series_data_file_name <<
"\n";
8081 throw logic_error(buffer.str());
8084 streamsize size =
sizeof(Index);
8086 Index columns_number;
8089 file.read(
reinterpret_cast<char*
>(&columns_number), size);
8090 file.read(
reinterpret_cast<char*
>(&rows_number), size);
8092 size =
sizeof(type);
8098 for(Index i = 0; i < rows_number*columns_number; i++)
8100 file.read(
reinterpret_cast<char*
>(&value), size);
8113 ifstream file(input_data_file_name.c_str());
8117 ostringstream buffer;
8119 buffer <<
"OpenNN Exception: DataSet class.\n"
8120 <<
"void check_input_csv() method.\n"
8121 <<
"Cannot open input data file: " << input_data_file_name <<
"\n";
8123 throw logic_error(buffer.str());
8127 Index line_number = 0;
8128 Index total_lines = 0;
8138 getline(file, line);
8144 if(line.empty())
continue;
8148 tokens_count = count_tokens(line, separator_char);
8150 if(tokens_count != columns_number)
8152 ostringstream buffer;
8154 buffer <<
"OpenNN Exception: DataSet class.\n"
8155 <<
"void check_input_csv() method.\n"
8156 <<
"Line " << line_number <<
": Size of tokens in input file ("
8157 << tokens_count <<
") is not equal to number of columns("
8158 << columns_number <<
"). \n"
8159 <<
"Input csv must contain values for all the variables except the target. \n";
8161 throw logic_error(buffer.str());
8167 if(total_lines == 0)
8169 ostringstream buffer;
8171 buffer <<
"OpenNN Exception: DataSet class.\n"
8172 <<
"void check_input_csv() method.\n"
8173 <<
"Input data file is empty. \n";
8175 throw logic_error(buffer.str());
8183 const char& separator_char,
8185 const bool& has_columns_name,
8186 const bool& has_rows_label)
const
8188 ifstream file(input_data_file_name.c_str());
8192 ostringstream buffer;
8194 buffer <<
"OpenNN Exception: DataSet class.\n"
8195 <<
"void read_input_csv() method.\n"
8196 <<
"Cannot open input data file: " << input_data_file_name <<
"\n";
8198 throw logic_error(buffer.str());
8203 Index input_samples_count = 0;
8206 Index line_number = 0;
8216 getline(file, line);
8222 if(line.empty())
continue;
8224 tokens_count = count_tokens(line, separator_char);
8226 if(tokens_count != columns_number)
8228 ostringstream buffer;
8230 buffer <<
"OpenNN Exception: DataSet class.\n"
8231 <<
"void read_input_csv() method.\n"
8232 <<
"Line " << line_number <<
": Size of tokens("
8233 << tokens_count <<
") is not equal to number of columns("
8234 << columns_number <<
").\n";
8236 throw logic_error(buffer.str());
8239 input_samples_count++;
8246 if(has_columns_name) input_samples_count--;
8248 Tensor<type, 2> input_data(input_samples_count, variables_number);
8252 file.open(input_data_file_name.c_str());
8256 ostringstream buffer;
8258 buffer <<
"OpenNN Exception: DataSet class.\n"
8259 <<
"void read_input_csv() method.\n"
8260 <<
"Cannot open input data file: " << input_data_file_name <<
" for filling input data file. \n";
8262 throw logic_error(buffer.str());
8267 if(has_columns_name)
8271 getline(file, line);
8273 if(line.empty())
continue;
8281 Tensor<string, 1> tokens;
8284 Index variable_index = 0;
8285 Index token_index = 0;
8286 bool is_ID = has_rows_label;
8288 const bool is_float = is_same<type, float>::value;
8289 bool has_missing_values =
false;
8293 getline(file, line);
8299 if(line.empty())
continue;
8301 tokens = get_tokens(line, separator_char);
8305 is_ID = has_rows_label;
8307 for(Index i = 0; i < columns.size(); i++)
8315 if(columns(i).column_use == VariableUse::UnusedVariable)
8320 else if(columns(i).column_use != VariableUse::Input)
8325 if(columns(i).type == ColumnType::Numeric)
8329 has_missing_values =
true;
8330 input_data(line_number, variable_index) =
static_cast<type
>(NAN);
8334 input_data(line_number, variable_index) = type(strtof(tokens(token_index).
data(), NULL));
8338 input_data(line_number, variable_index) = type(stof(tokens(token_index)));
8343 else if(columns(i).type == ColumnType::Binary)
8347 has_missing_values =
true;
8348 input_data(line_number, variable_index) =
static_cast<type
>(NAN);
8350 else if(columns(i).categories.size() > 0 && tokens(token_index) == columns(i).categories(0))
8352 input_data(line_number, variable_index) = type(1);
8354 else if(tokens(token_index) == columns(i).name)
8356 input_data(line_number, variable_index) = type(1);
8361 else if(columns(i).type == ColumnType::Categorical)
8363 for(Index k = 0; k < columns(i).get_categories_number(); k++)
8367 has_missing_values =
true;
8368 input_data(line_number, variable_index) =
static_cast<type
>(NAN);
8370 else if(tokens(token_index) == columns(i).categories(k))
8372 input_data(line_number, variable_index) = type(1);
8378 else if(columns(i).type == ColumnType::DateTime)
8382 has_missing_values =
true;
8383 input_data(line_number, variable_index) =
static_cast<type
>(NAN);
8387 input_data(line_number, variable_index) =
static_cast<type
>(date_to_timestamp(tokens(token_index), gmt));
8392 else if(columns(i).type == ColumnType::Constant)
8396 has_missing_values =
true;
8397 input_data(line_number, variable_index) =
static_cast<type
>(NAN);
8401 input_data(line_number, variable_index) = type(strtof(tokens(token_index).
data(), NULL));
8405 input_data(line_number, variable_index) = type(stof(tokens(token_index)));
8419 if(!has_missing_values)
8431 const Tensor<type, 1> means = mean(input_data);
8433 const Index samples_number = input_data.dimension(0);
8434 const Index variables_number = input_data.dimension(1);
8436 #pragma omp parallel for schedule(dynamic)
8438 for(Index j = 0; j < variables_number; j++)
8440 for(Index i = 0; i < samples_number; i++)
8442 if(
isnan(input_data(i, j)))
8444 input_data(i,j) = means(j);
8451 const Tensor<type, 1> medians = median(input_data);
8453 const Index samples_number = input_data.dimension(0);
8454 const Index variables_number = input_data.dimension(1);
8456 #pragma omp parallel for schedule(dynamic)
8458 for(Index j = 0; j < variables_number; j++)
8460 for(Index i = 0; i < samples_number; i++)
8462 if(
isnan(input_data(i, j)))
8464 input_data(i,j) = medians(j);
8482 const Index samples_number = get_samples_number();
8486 Tensor<Index, 1> class_distribution;
8488 if(targets_number == 1)
8490 class_distribution = Tensor<Index, 1>(2);
8492 Index target_index = target_variables_indices(0);
8494 Index positives = 0;
8495 Index negatives = 0;
8497 for(Index sample_index = 0; sample_index < static_cast<Index>(samples_number); sample_index++)
8499 if(!
isnan(
data(
static_cast<Index
>(sample_index),target_index)))
8501 if(
data(
static_cast<Index
>(sample_index), target_index) <
static_cast<type
>(0.5))
8512 class_distribution(0) = negatives;
8513 class_distribution(1) = positives;
8517 class_distribution = Tensor<Index, 1>(targets_number);
8519 for(Index i = 0; i < samples_number; i++)
8523 for(Index j = 0; j < targets_number; j++)
8525 if(
data(i,target_variables_indices(j)) ==
static_cast<type
>(NAN))
continue;
8527 if(
data(i,target_variables_indices(j)) > type(0.5)) class_distribution(j)++;
8533 return class_distribution;
8549 Tensor<Tensor<Index, 1>, 1> return_values(2);
8551 return_values(0) = Tensor<Index, 1>(samples_number);
8552 return_values(1) = Tensor<Index, 1>(used_columns_number);
8554 return_values(0).setZero();
8555 return_values(1).setZero();
8559 Index used_column_index = 0;
8560 Index variable_index = 0;
8562 #pragma omp parallel for
8564 for(Index i = 0; i < columns_number; i++)
8566 if(columns(i).column_use == VariableUse::UnusedVariable && columns(i).type == ColumnType::Categorical)
8568 variable_index += columns(i).get_categories_number();
8571 else if(columns(i).column_use == VariableUse::UnusedVariable)
8577 if(columns(i).type == ColumnType::Categorical || columns(i).type == ColumnType::Binary || columns(i).type == ColumnType::DateTime)
8579 used_column_index++;
8580 columns(i).get_categories_number() == 0 ? variable_index++ : variable_index += columns(i).get_categories_number();
8585 const type interquartile_range = box_plots(used_column_index).third_quartile - box_plots(used_column_index).first_quartile;
8587 if(interquartile_range < numeric_limits<type>::epsilon())
8589 used_column_index++;
8594 Index columns_outliers = 0;
8596 for(Index j = 0; j < samples_number; j++)
8598 const Tensor<type, 1> sample =
get_sample_data(samples_indices(
static_cast<Index
>(j)));
8600 if(sample(variable_index) <(box_plots(used_column_index).first_quartile - cleaning_parameter*interquartile_range) ||
8601 sample(variable_index) >(box_plots(used_column_index).third_quartile + cleaning_parameter*interquartile_range))
8603 return_values(0)(
static_cast<Index
>(j)) = 1;
8609 return_values(1)(used_column_index) = columns_outliers;
8611 used_column_index++;
8616 return return_values;
// Builds a 0/1 outlier mask from per-sample outlier ranks: the given
// contamination fraction of samples (by rank order) is marked with 1.
// NOTE(review): extraction artifact — this view elides a third parameter
// (apparently a flag selecting which tail of the ranking is flagged, given
// the two alternative loops below), the samples_number declaration, the
// sort comparator's body, and the branch that chooses between the two loops.
// Confirm against the original source before editing.
8634Tensor<Index, 1> DataSet::select_outliers_via_contamination(
const Tensor<type, 1>& outlier_ranks,
 8635 const type & contamination,
// Each entry pairs a sample index with its rank so the pair survives sorting.
 8640 Tensor<Tensor<type, 1>, 1> ordered_ranks(samples_number);
 8642 Tensor<Index, 1> outlier_indexes(samples_number);
 8643 outlier_indexes.setZero();
 8645 for(Index i = 0; i < samples_number; i++)
 8647 ordered_ranks(i) = Tensor<type, 1>(2);
 8648 ordered_ranks(i)(0) = type(i);
 8649 ordered_ranks(i)(1) = outlier_ranks(i);
// Sort the (index, rank) pairs by rank; comparator body elided in this view.
 8652 sort(ordered_ranks.data(), ordered_ranks.data() + samples_number,
 8653 [](Tensor<type, 1> & a, Tensor<type, 1> & b) ->
bool
// Flag the top contamination fraction of the sorted ranking...
 8660 for(Index i = Index((type(1) - contamination)*type(samples_number)); i < samples_number; i++)
 8661 outlier_indexes(
static_cast<Index
>(ordered_ranks(i)(0))) = 1;
// ...or, in the alternative branch, the bottom contamination fraction.
 8665 for(Index i = 0; i < Index(contamination*type(samples_number)); i++)
 8666 outlier_indexes(
static_cast<Index
>(ordered_ranks(i)(0))) = 1;
 8669 return outlier_indexes;
// Builds a 0/1 outlier mask: samples whose rank deviates from the mean rank
// by more than deviation_factor standard deviations are marked with 1.
// NOTE(review): extraction artifact — this view elides a further parameter
// (apparently a flag choosing the upper-tail vs lower-tail loop below), the
// samples_number declaration, and the branch selecting between the loops.
8673Tensor<Index, 1> DataSet::select_outliers_via_standard_deviation(
const Tensor<type, 1>& outlier_ranks,
 8674 const type & deviation_factor,
 8678 const type mean_ranks = mean(outlier_ranks);
 8679 const type std_ranks = standard_deviation(outlier_ranks);
 8681 Tensor<Index, 1> outlier_indexes(samples_number);
 8682 outlier_indexes.setZero();
// Upper-tail variant: flag ranks above mean + k*std...
 8687 for(Index i = 0; i < samples_number; i++)
 8689 if(outlier_ranks(i) > mean_ranks + deviation_factor*std_ranks)
 8690 outlier_indexes(i) = 1;
// ...lower-tail variant: flag ranks below mean - k*std.
 8695 for(Index i = 0; i < samples_number; i++)
 8697 if(outlier_ranks(i) < mean_ranks - deviation_factor*std_ranks)
 8698 outlier_indexes(i) = 1;
 8703 return outlier_indexes;
// Euclidean distance between two samples of the data matrix, restricted to
// the given variable indices: sqrt(sum over i of (a_i - b_i)^2).
// NOTE(review): extraction artifact — the declaration of the local `error`
// accumulator (presumably `type error;`) and the braces are elided in this
// view.
8707type DataSet::calculate_euclidean_distance(
const Tensor<Index, 1>& variables_indices,
 8708 const Index& sample_index,
 8709 const Index& other_sample_index)
const
 8711 const Index input_variables_number = variables_indices.size();
 8713 type distance = type(0);
 8716 for(Index i = 0; i < input_variables_number; i++)
 8718 error =
data(sample_index, variables_indices(i)) -
data(other_sample_index, variables_indices(i));
 8720 distance += error*error;
 8723 return sqrt(distance);
8727Tensor<type, 2> DataSet::calculate_distance_matrix(
const Tensor<Index,1>& indices)
const
8729 const Index samples_number = indices.size();
8733 Tensor<type, 2> distance_matrix(samples_number, samples_number);
8734 distance_matrix.setZero();
8736 #pragma omp parallel for
8738 for(Index i = 0; i < samples_number ; i++)
8740 for(Index k = 0; k < i; k++)
8742 distance_matrix(i,k)
8743 = distance_matrix(k,i)
8744 = calculate_euclidean_distance(input_variables_indices, indices(i), indices(k));
8747 return distance_matrix;
8751Tensor<list<Index>, 1> DataSet::calculate_k_nearest_neighbors(
const Tensor<type, 2>& distance_matrix,
const Index& k_neighbors)
const
8753 const Index samples_number = distance_matrix.dimensions()[0];
8755 Tensor<list<Index>, 1> neighbors_indices(samples_number);
8757 #pragma omp parallel for
8759 for(Index i = 0; i < samples_number; i++)
8761 list<type> min_distances(k_neighbors, numeric_limits<type>::max());
8763 neighbors_indices(i) = list<Index>(k_neighbors, 0);
8765 for(Index j = 0; j < samples_number; j++)
8767 if(j == i)
continue;
8769 list<Index>::iterator neighbor_it = neighbors_indices(i).begin();
8770 list<type>::iterator current_min = min_distances.begin();
8772 for(Index k = 0; k < k_neighbors; k++, current_min++, neighbor_it++)
8774 if(distance_matrix(i,j) < *current_min)
8776 neighbors_indices(i).insert(neighbor_it, j);
8778 min_distances.insert(current_min, distance_matrix(i,j));
8785 neighbors_indices(i).resize(k_neighbors);
8788 return neighbors_indices;
8792Tensor<Tensor<type, 1>, 1> DataSet::get_kd_tree_data()
const
8800 Tensor<Tensor<type, 1>, 1> kd_tree_data(used_samples_number);
8802 for(Index i = 0; i < used_samples_number; i++)
8804 kd_tree_data(i) = Tensor<type, 1>(input_variables_number+1);
8806 kd_tree_data(i)(0) = type(used_samples_indices(i));
8808 for(Index j = 0; j < input_variables_number; j++)
8809 kd_tree_data(i)(j+1) =
data(used_samples_indices(i), input_variables_indices(j));
8812 return kd_tree_data;
8816Tensor<Tensor<Index, 1>, 1> DataSet::create_bounding_limits_kd_tree(
const Index& depth)
const
8819 Tensor<Tensor<Index, 1>, 1> bounding_limits(depth+1);
8821 bounding_limits(0) = Tensor<Index, 1>(2);
8822 bounding_limits(0)(0) = 0;
8826 for(Index i = 1; i <= depth; i++)
8828 bounding_limits(i) = Tensor<Index, 1>(
pow(2, i)+1);
8829 bounding_limits(i)(0) = 0;
8831 for(Index j = 1; j < bounding_limits(i).size()-1; j = j+2)
8833 bounding_limits(i)(j) = (bounding_limits(i-1)(j/2+1) - bounding_limits(i-1)(j/2))/2
8834 + bounding_limits(i-1)(j/2);
8836 bounding_limits(i)(j+1) = bounding_limits(i-1)(j/2+1);
8839 return bounding_limits;
8843void DataSet::create_kd_tree(Tensor<Tensor<type, 1>, 1>& tree,
const Tensor<Tensor<Index, 1>, 1>& bounding_limits)
const
8845 const Index depth = bounding_limits.size()-1;
8846 const Index input_variables = tree(0).size();
8848 auto specific_sort = [&tree](
const Index & first,
const Index & last,
const Index & split_variable)
8850 sort(tree.data() + first, tree.data() + last,
8851 [&split_variable](
const Tensor<type, 1> & a,
const Tensor<type, 1> & b) ->
bool
8853 return a(split_variable) > b(split_variable);
8857 specific_sort(bounding_limits(0)(0), bounding_limits(0)(1), 1);
8859 Index split_variable = 2;
8861 for(Index i = 1; i <= depth; i++, split_variable++)
8863 split_variable = max(split_variable % input_variables,
static_cast<Index
>(1));
8865 specific_sort(bounding_limits(i)(0), bounding_limits(i)(1), split_variable);
8867 #pragma omp parallel for
8868 for(Index j = 1; j < (Index)bounding_limits(i).size()-1; j++)
8869 specific_sort(bounding_limits(i)(j)+1, bounding_limits(i)(j+1), split_variable);
8874Tensor<list<Index>, 1> DataSet::calculate_bounding_boxes_neighbors(
const Tensor<Tensor<type, 1>, 1>& tree,
8875 const Tensor<Index, 1>& leaves_indices,
8877 const Index& k_neighbors)
const
8911 return Tensor<list<Index>, 1>();
8915Tensor<list<Index>, 1> DataSet::calculate_kd_tree_neighbors(
const Index& k_neighbors,
const Index& min_samples_leaf)
const
8931 return Tensor<list<Index>, 1>();
8935Tensor<type, 1> DataSet::calculate_average_reachability(Tensor<list<Index>, 1>& k_nearest_indexes,
8936 const Index& k)
const
8942 Tensor<type, 1> average_reachability(samples_number);
8943 average_reachability.setZero();
8945 #pragma omp parallel for
8947 for(Index i = 0; i < samples_number; i++)
8949 list<Index>::iterator neighbor_it = k_nearest_indexes(i).begin();
8951 type distance_between_points;
8952 type distance_2_k_neighbor;
8954 for(Index j = 0; j < k; j++, neighbor_it++)
8956 const Index neighbor_k_index = k_nearest_indexes(*neighbor_it).back();
8958 distance_between_points = calculate_euclidean_distance(input_variables_indices, i, *neighbor_it);
8959 distance_2_k_neighbor = calculate_euclidean_distance(input_variables_indices, *neighbor_it, neighbor_k_index);
8961 average_reachability(i) += max(distance_between_points, distance_2_k_neighbor);
8964 average_reachability(i) /= type(k);
8967 return average_reachability;
8971Tensor<type, 1> DataSet::calculate_local_outlier_factor(Tensor<list<Index>, 1>& k_nearest_indexes,
8972 const Tensor<type, 1>& average_reachabilities,
8973 const Index & k)
const
8976 Tensor<type, 1> LOF_value(samples_number);
8978 #pragma omp parallel for
8980 for(Index i = 0; i < samples_number; i++)
8984 for(
auto & neighbor_index : k_nearest_indexes(i))
8985 sum += average_reachabilities(i) / average_reachabilities(neighbor_index);
8987 LOF_value(i) = sum/k ;
9003 const Index& min_samples_leaf,
9004 const type& contamination)
const
9008 ostringstream buffer;
9010 buffer <<
"OpenNN Exception: DataSet class.\n"
9011 <<
"Tensor<Index, 1> DataSet::calculate_local_outlier_factor_outliers(const Index&, const Index&, const type&) const method.\n"
9012 <<
"k_neighbors(" << k_neighbors <<
") should be a positive integer value\n";
9014 throw logic_error(buffer.str());
9017 if(contamination < type(0) && contamination > type(0.5))
9019 ostringstream buffer;
9021 buffer <<
"OpenNN Exception: DataSet class.\n"
9022 <<
"Tensor<Index, 1> DataSet::calculate_local_outlier_factor_outliers(const Index&, const Index&, const type&) const method.\n"
9023 <<
"Outlier contamination(" << contamination <<
") should be a value between 0.0 and 0.5\n";
9025 throw logic_error(buffer.str());
9030 bool kdtree =
false;
9032 Index k = min(k_neighbors, samples_number-1);
9033 Index min_samples_leaf_fix = max(min_samples_leaf,
static_cast<Index
>(0));
9035 if(min_samples_leaf == 0 && samples_number > 5000)
9037 min_samples_leaf_fix = 200;
9038 k = min(k, min_samples_leaf_fix-1);
9041 else if(min_samples_leaf!=0 && min_samples_leaf < samples_number/2)
9043 k = min(k, min_samples_leaf_fix-1);
9047 Tensor<list<Index>, 1> k_nearest_indexes;
9049 kdtree ? k_nearest_indexes = calculate_kd_tree_neighbors(k, min_samples_leaf_fix)
9053 const Tensor<type, 1> average_reachabilities = calculate_average_reachability(k_nearest_indexes, k);
9056 const Tensor<type, 1> LOF_value = calculate_local_outlier_factor(k_nearest_indexes, average_reachabilities, k);
9059 Tensor<Index, 1> outlier_indexes;
9061 contamination > type(0)
9062 ? outlier_indexes = select_outliers_via_contamination(LOF_value, contamination,
true)
9063 : outlier_indexes = select_outliers_via_standard_deviation(LOF_value, type(2.0),
true);
9065 return outlier_indexes;
9069void DataSet::calculate_min_max_indices_list(list<Index>& elements,
const Index& variable_index, type& min, type& max)
const
9072 min = max =
data(elements.front(), variable_index);
9073 for(
auto & sample_index : elements)
9075 value =
data(sample_index, variable_index);
9076 if(min > value) min = value;
9077 else if(max < value) max = value;
9082Index DataSet::split_isolation_tree(Tensor<type, 2> & tree, list<list<Index>>& tree_simulation, list<Index>& tree_index)
const
9140Tensor<type, 2> DataSet::create_isolation_tree(
const Tensor<Index, 1>& indices,
const Index& max_depth)
const
9142 const Index used_samples_number = indices.size();
9147 list<list<Index>> tree_simulation;
9148 list<Index> tree_index;
9149 list<Index> current_node_samples;
9151 Tensor<type, 2> tree(
pow(2, max_depth+1) - 1, 3);
9152 tree.setConstant(numeric_limits<type>::infinity());
9154 for(Index i = 0; i < used_samples_number; i++)
9155 current_node_samples.push_back(indices(i));
9157 tree_simulation.push_back(current_node_samples);
9158 tree(0, 2) = type(used_samples_number);
9159 tree_index.push_back(0);
9161 current_node_samples.clear();
9163 Index current_depth_nodes = 1;
9164 Index next_depth_nodes = 0;
9165 Index current_variable_index = input_variables_indices(rand() % variables_number);
9166 Index current_depth = 0;
9168 Index current_index;
9172 while(current_depth < max_depth && !(tree_simulation.empty()))
9174 current_node_samples = tree_simulation.front();
9175 current_index = tree_index.front();
9177 calculate_min_max_indices_list(current_node_samples, current_variable_index, min, max);
9179 tree(current_index, 0) =
static_cast<type
>((max-min)*(type(rand())/
static_cast<type
>(RAND_MAX))) + min;
9181 tree(current_index, 1) = type(current_variable_index);
9183 next_depth_nodes += split_isolation_tree(tree, tree_simulation, tree_index);
9185 tree_simulation.pop_front();
9186 tree_index.pop_front();
9188 current_depth_nodes--;
9190 if(current_depth_nodes == 0)
9193 swap(current_depth_nodes, next_depth_nodes);
9194 current_variable_index = input_variables_indices(rand() % variables_number);
9202Tensor<Tensor<type, 2>, 1> DataSet::create_isolation_forest(
const Index& trees_number,
const Index& sub_set_size,
const Index& max_depth)
const
9206 Tensor<Tensor<type, 2>, 1> forest(trees_number);
9208 std::random_device rng;
9209 std::mt19937 urng(rng());
9211 for(Index i = 0; i < trees_number; i++)
9213 Tensor<Index, 1> sub_set_indices(sub_set_size);
9214 Tensor<Index, 1> aux_indices = indices;
9215 std::shuffle(aux_indices.data(), aux_indices.data()+samples_number, urng);
9217 for(Index j = 0; j < sub_set_size; j++)
9218 sub_set_indices(j) = aux_indices(j);
9220 forest(i) = create_isolation_tree(sub_set_indices, max_depth);
9227type DataSet::calculate_tree_path(
const Tensor<type, 2>& tree,
const Index& sample_index,
9228 const Index& tree_depth)
const
9230 Index current_index = 0;
9231 Index current_depth = 0;
9232 const Index tree_length = tree.dimensions()[0];
9237 while(current_depth < tree_depth)
9239 if(tree(current_index, 2) == type(1))
9241 return type(current_depth);
9243 else if(current_index*2 >= tree_length ||
9244 (tree(current_index*2+1, 2) == numeric_limits<type>::infinity())
9247 samples = tree(current_index, 2);
9249 return type(
log(samples- type(1)) - (type(2) *(samples- type(1)))/samples + type(0.5772) + type(current_depth));
9253 value =
data(sample_index,
static_cast<Index
>(tree(current_index, 1)));
9255 (value < tree(current_index, 0)) ? current_index = current_index*2 + 1
9256 : current_index = current_index*2 + 2;
9261 samples = tree(current_index, 2);
9262 if(samples == type(1))
9263 return type(current_depth);
9265 return type(
log(samples- type(1))-(type(2.0) *(samples- type(1)))/samples + type(0.5772) + type(current_depth));
9269Tensor<type, 1> DataSet::calculate_average_forest_paths(
const Tensor<Tensor<type, 2>, 1>& forest,
const Index& tree_depth)
const
9272 const Index n_trees = forest.dimensions()[0];
9273 Tensor<type, 1> average_paths(samples_number);
9274 average_paths.setZero();
9276 # pragma omp parallel for
9277 for(Index i = 0; i < samples_number; i++)
9279 for(Index j = 0; j < n_trees; j++)
9280 average_paths(i) += calculate_tree_path(forest(j), i, tree_depth);
9282 average_paths(i) /= type(n_trees);
9284 return average_paths;
9288Tensor<Index, 1> DataSet::calculate_isolation_forest_outliers(
const Index& n_trees,
9289 const Index& subs_set_samples,
9290 const type& contamination)
const
9293 const Index fixed_subs_set_samples = min(samples_number, subs_set_samples);
9294 const Index max_depth =
ceil(
log2(fixed_subs_set_samples))*2;
9295 const Tensor<Tensor<type, 2>, 1> forest = create_isolation_forest(n_trees, fixed_subs_set_samples, max_depth);
9297 const Tensor<type, 1> average_paths = calculate_average_forest_paths(forest, max_depth);
9299 Tensor<Index, 1> outlier_indexes;
9301 contamination > type(0)
9302 ? outlier_indexes = select_outliers_via_contamination(average_paths, contamination,
false)
9303 : outlier_indexes = select_outliers_via_standard_deviation(average_paths, type(2.0), false);
9305 return outlier_indexes;
9321 ostringstream buffer;
9323 buffer <<
"OpenNN Exception: DataSet class.\n"
9324 <<
"Tensor<type, 2> calculate_cross_correlations(const Index& lags_number) const method.\n"
9326 <<
") is greater than the number of samples("
9327 << samples_number <<
") \n";
9329 throw logic_error(buffer.str());
9334 const Index input_columns_number = get_input_time_series_columns_number();
9335 const Index target_columns_number = get_target_time_series_columns_number();
9337 const Index input_target_columns_number = input_columns_number + target_columns_number;
9339 const Tensor<Index, 1> input_columns_indices = get_input_time_series_columns_indices();
9340 const Tensor<Index, 1> target_columns_indices = get_target_time_series_columns_indices();
9342 Index input_target_numeric_column_number = 0;
9346 for(Index i = 0; i < input_target_columns_number; i++)
9348 if(i < input_columns_number)
9350 const Index column_index = input_columns_indices(i);
9352 const ColumnType input_column_type = time_series_columns(column_index).type;
9354 if(input_column_type == ColumnType::Numeric)
9356 input_target_numeric_column_number++;
9361 const Index column_index = target_columns_indices(counter);
9363 const ColumnType target_column_type = time_series_columns(column_index).type;
9365 if(target_column_type == ColumnType::Numeric)
9367 input_target_numeric_column_number++;
9374 Index new_lags_number;
9389 Tensor<type, 2> autocorrelations(input_target_numeric_column_number, new_lags_number);
9390 Tensor<type, 1> autocorrelations_vector(new_lags_number);
9391 Tensor<type, 2> input_i;
9392 Index counter_i = 0;
9394 for(Index i = 0; i < columns_number; i++)
9396 if(time_series_columns(i).column_use != VariableUse::UnusedVariable && time_series_columns(i).type == ColumnType::Numeric)
9399 cout <<
"Calculating " << time_series_columns(i).name <<
" autocorrelations" << endl;
9406 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
9408 autocorrelations_vector = OpenNN::autocorrelations(thread_pool_device, current_input_i, new_lags_number);
9410 for(Index j = 0; j < new_lags_number; j++)
9412 autocorrelations (counter_i, j) = autocorrelations_vector(j) ;
9418 return autocorrelations;
9430 ostringstream buffer;
9432 buffer <<
"OpenNN Exception: DataSet class.\n"
9433 <<
"Tensor<type, 3> calculate_cross_correlations(const Index& lags_number) const method.\n"
9435 <<
") is greater than the number of samples("
9436 << samples_number <<
") \n";
9438 throw logic_error(buffer.str());
9443 const Index input_columns_number = get_input_time_series_columns_number();
9444 const Index target_columns_number = get_target_time_series_columns_number();
9446 const Index input_target_columns_number = input_columns_number + target_columns_number;
9448 const Tensor<Index, 1> input_columns_indices = get_input_time_series_columns_indices();
9449 const Tensor<Index, 1> target_columns_indices = get_target_time_series_columns_indices();
9451 Index input_target_numeric_column_number = 0;
9454 for(Index i = 0; i < input_target_columns_number; i++)
9456 if(i < input_columns_number)
9458 const Index column_index = input_columns_indices(i);
9460 const ColumnType input_column_type = time_series_columns(column_index).type;
9462 if(input_column_type == ColumnType::Numeric)
9464 input_target_numeric_column_number++;
9469 const Index column_index = target_columns_indices(counter);
9471 ColumnType target_column_type = time_series_columns(column_index).type;
9473 if(target_column_type == ColumnType::Numeric)
9475 input_target_numeric_column_number++;
9481 Index new_lags_number;
9496 Tensor<type, 3> cross_correlations(input_target_numeric_column_number, input_target_numeric_column_number, new_lags_number);
9497 Tensor<type, 1> cross_correlations_vector(new_lags_number);
9498 Tensor<type, 2> input_i;
9499 Tensor<type, 2> input_j;
9500 Index counter_i = 0;
9501 Index counter_j = 0;
9503 for(Index i = 0; i < columns_number; i++)
9505 if(time_series_columns(i).column_use != VariableUse::UnusedVariable && time_series_columns(i).type == ColumnType::Numeric)
9509 if(
display) cout <<
"Calculating " << time_series_columns(i).name <<
" cross correlations:" << endl;
9518 for(Index j = 0; j < columns_number; j++)
9520 if(time_series_columns(j).column_use != VariableUse::UnusedVariable && time_series_columns(j).type == ColumnType::Numeric)
9524 if(
display) cout <<
" -VS- " << time_series_columns(j).name << endl;
9532 const TensorMap<Tensor<type, 1>> current_input_i(input_i.data(), input_i.dimension(0));
9533 const TensorMap<Tensor<type, 1>> current_input_j(input_j.data(), input_j.dimension(0));
9535 cross_correlations_vector = OpenNN::cross_correlations(thread_pool_device, current_input_i, current_input_j, new_lags_number);
9537 for(Index k = 0; k < new_lags_number; k++)
9539 cross_correlations (counter_i, counter_j, k) = cross_correlations_vector(k) ;
9549 return cross_correlations;
9560 set(samples_number, variables_number);
9562 data.setConstant(value);
9576 set(samples_number, variables_number);
9589 set(samples_number, variables_number);
9591 for(Index i = 0; i < samples_number; i++)
9593 for(Index j = 0; j < variables_number; j++)
9595 data(i,j) =
static_cast<type
>(j);
9609 const Index inputs_number = variables_number-1;
9611 set(samples_number, variables_number);
9615 #pragma omp parallel for
9617 for(Index i = 0; i < samples_number; i++)
9621 for(Index j = 0; j < inputs_number-1; j++)
9623 const type value =
data(i,j);
9624 const type next_value =
data(i,j+1);
9626 rosenbrock += (type(1) - value)*(type(1) - value) + type(100)*(next_value-value*value)*(next_value-value*value);
9629 data(i, inputs_number) = rosenbrock;
9637void DataSet::generate_sum_data(
const Index& samples_number,
const Index& variables_number)
9639 set(samples_number,variables_number);
9643 for(Index i = 0; i < samples_number; i++)
9645 for(Index j = 0; j < variables_number-1; j++)
9647 data(i,variables_number-1) +=
data(i,j);
9666 const Index used_variables_number = used_variables_indices.size();
9670 if(minimums.size() != used_variables_number)
9672 ostringstream buffer;
9674 buffer <<
"OpenNN Exception: DataSet class.\n"
9675 <<
"Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&) method.\n"
9676 <<
"Size of minimums(" << minimums.size() <<
") is not equal to number of variables(" << used_variables_number <<
").\n";
9678 throw logic_error(buffer.str());
9681 if(maximums.size() != used_variables_number)
9683 ostringstream buffer;
9685 buffer <<
"OpenNN Exception: DataSet class.\n"
9686 <<
"Tensor<Index, 1> filter_data(const Tensor<type, 1>&, const Tensor<type, 1>&) method.\n"
9687 <<
"Size of maximums(" << maximums.size() <<
") is not equal to number of variables(" << used_variables_number <<
").\n";
9689 throw logic_error(buffer.str());
9694 const Index samples_number = get_samples_number();
9696 Tensor<type, 1> filtered_indices(samples_number);
9697 filtered_indices.setZero();
9700 const Index used_samples_number = used_samples_indices.size();
9702 Index sample_index = 0;
9704 for(Index i = 0; i < used_variables_number; i++)
9706 const Index variable_index = used_variables_indices(i);
9708 for(Index j = 0; j < used_samples_number; j++)
9710 sample_index = used_samples_indices(j);
9712 if(
get_sample_use(sample_index) == SampleUse::UnusedSample)
continue;
9714 if(
isnan(
data(sample_index, variable_index)))
continue;
9716 if(
abs(
data(sample_index, variable_index) - minimums(i)) <= type(NUMERIC_LIMITS_MIN)
9717 ||
abs(
data(sample_index, variable_index) - maximums(i)) <= type(NUMERIC_LIMITS_MIN))
continue;
9719 if(
data(sample_index,variable_index) < minimums(i)
9720 ||
data(sample_index,variable_index) > maximums(i))
9722 filtered_indices(sample_index) = type(1);
9729 const Index filtered_samples_number =
9730 static_cast<Index
>(count_if(filtered_indices.data(),
9731 filtered_indices.data()+filtered_indices.size(), [](type value)
9732 {return value > static_cast<type>(0.5);}));
9734 Tensor<Index, 1> filtered_samples_indices(filtered_samples_number);
9738 for(Index i = 0; i < samples_number; i++)
9740 if(filtered_indices(i) >
static_cast<type
>(0.5))
9742 filtered_samples_indices(index) = i;
9747 return filtered_samples_indices;
9755 const Index samples_number = get_samples_number();
9757 #pragma omp parallel for
9759 for(Index i = 0; i <samples_number; i++)
9773 const Tensor<type, 1> means = mean(
data, used_samples_indices, used_variables_indices);
9775 const Index samples_number = used_samples_indices.size();
9776 const Index variables_number = used_variables_indices.size();
9778 Index current_variable;
9779 Index current_sample;
9781#pragma omp parallel for schedule(dynamic)
9783 for(Index j = 0; j < variables_number; j++)
9785 current_variable = used_variables_indices(j);
9787 for(Index i = 0; i < samples_number; i++)
9789 current_sample = used_samples_indices(i);
9791 if(
isnan(
data(current_sample, current_variable)))
9793 data(current_sample,current_variable) = means(j);
9807 const Tensor<type, 1> medians = median(
data, used_samples_indices, used_variables_indices);
9809 const Index variables_number = used_variables_indices.size();
9810 const Index samples_number = used_samples_indices.size();
9812#pragma omp parallel for schedule(dynamic)
9814 for(Index j = 0; j < variables_number; j++)
9816 for(Index i = 0 ; i < samples_number ; i++)
9818 if(
isnan(
data(used_samples_indices(i),used_variables_indices(j))))
data(used_samples_indices(i),used_variables_indices(j)) = medians(j);
9832 case MissingValuesMethod::Unuse:
9838 case MissingValuesMethod::Mean:
9844 case MissingValuesMethod::Median:
9853void DataSet::read_csv()
9857 if(!has_time_columns() && !has_categorical_columns())
9859 read_csv_2_simple();
9861 read_csv_3_simple();
9865 read_csv_2_complete();
9867 read_csv_3_complete();
9873Tensor<string, 1> DataSet::get_default_columns_names(
const Index& columns_number)
9875 Tensor<string, 1> columns_names(columns_number);
9877 for(Index i = 0; i < columns_number; i++)
9879 ostringstream buffer;
9881 buffer <<
"column_" << i+1;
9883 columns_names(i) = buffer.str();
9886 return columns_names;
9890void DataSet::read_csv_1()
9894 ostringstream buffer;
9896 buffer <<
"OpenNN Exception: DataSet class.\n"
9897 <<
"void read_csv() method.\n"
9898 <<
"Data file name is empty.\n";
9900 throw logic_error(buffer.str());
9907 ostringstream buffer;
9909 buffer <<
"OpenNN Exception: DataSet class.\n"
9910 <<
"void read_csv() method.\n"
9913 throw logic_error(buffer.str());
9918 if(
display) cout <<
"Setting data file preview..." << endl;
9922 data_file_preview.resize(lines_number);
9926 Index lines_count = 0;
9930 getline(file, line);
9936 if(line.empty())
continue;
9938 check_separators(line);
9940 check_special_characters(line);
9942 data_file_preview(lines_count) = get_tokens(line, separator_char);
9946 if(lines_count == lines_number)
break;
9953 if(data_file_preview(0).size() == 0)
9955 ostringstream buffer;
9957 buffer <<
"OpenNN Exception: DataSet class.\n"
9958 <<
"void read_csv_1() method.\n"
9961 throw logic_error(buffer.str());
9966 if(
display) cout <<
"Setting rows labels..." << endl;
9968 string first_name = data_file_preview(0)(0);
9969 transform(first_name.begin(), first_name.end(), first_name.begin(), ::tolower);
9971 const Index columns_number =
has_rows_labels ? data_file_preview(0).size()-1 : data_file_preview(0).size();
9973 columns.resize(columns_number);
9979 ostringstream buffer;
9981 buffer <<
"OpenNN Exception: DataSet class.\n"
9982 <<
"void read_csv_1() method.\n"
9983 <<
"Some columns names are numeric.\n";
9985 throw logic_error(buffer.str());
9990 if(
display) cout <<
"Setting columns names..." << endl;
9995 Eigen::array<Eigen::Index, 1>({data_file_preview(0).size()-1})))
10005 if(
display) cout <<
"Setting columns types..." << endl;
10007 Index column_index = 0;
10009 for(Index i = 0; i < data_file_preview(0).dimension(0); i++)
10013 if((is_date_time_string(data_file_preview(1)(i)) && data_file_preview(1)(i) !=
missing_values_label)
10014 || (is_date_time_string(data_file_preview(2)(i)) && data_file_preview(2)(i) !=
missing_values_label)
10015 || (is_date_time_string(data_file_preview(lines_number-2)(i)) && data_file_preview(lines_number-2)(i) !=
missing_values_label)
10016 || (is_date_time_string(data_file_preview(lines_number-1)(i)) && data_file_preview(lines_number-1)(i) !=
missing_values_label))
10025 columns(column_index).type = ColumnType::DateTime;
10028 else if(((is_numeric_string(data_file_preview(1)(i)) && data_file_preview(1)(i) !=
missing_values_label) || data_file_preview(1)(i).empty())
10029 || ((is_numeric_string(data_file_preview(2)(i)) && data_file_preview(2)(i) !=
missing_values_label) || data_file_preview(1)(i).empty())
10030 || ((is_numeric_string(data_file_preview(lines_number-2)(i)) && data_file_preview(lines_number-2)(i) !=
missing_values_label) || data_file_preview(1)(i).empty())
10031 || ((is_numeric_string(data_file_preview(lines_number-1)(i)) && data_file_preview(lines_number-1)(i) !=
missing_values_label) || data_file_preview(1)(i).empty()))
10033 columns(column_index).type = ColumnType::Numeric;
10038 columns(column_index).type = ColumnType::Categorical;
10045 set_column_type(
time_column, DataSet::ColumnType::DateTime);
10050void DataSet::read_csv_2_simple()
10054 if(!file.is_open())
10056 ostringstream buffer;
10058 buffer <<
"OpenNN Exception: DataSet class.\n"
10059 <<
"void read_csv_2_simple() method.\n"
10062 throw logic_error(buffer.str());
10066 Index line_number = 0;
10074 getline(file, line);
10080 if(line.empty())
continue;
10086 Index samples_count = 0;
10088 Index tokens_count;
10090 if(
display) cout <<
"Setting data dimensions..." << endl;
10095 const Index raw_columns_number =
has_rows_labels ? columns_number + 1 : columns_number;
10101 getline(file, line);
10107 if(line.empty())
continue;
10109 tokens_count = count_tokens(line, separator_char);
10111 if(tokens_count != raw_columns_number)
10113 ostringstream buffer;
10115 buffer <<
"OpenNN Exception: DataSet class.\n"
10116 <<
"void read_csv_2_simple() method.\n"
10117 <<
"Line " << line_number <<
": Size of tokens("
10118 << tokens_count <<
") is not equal to number of columns("
10119 << raw_columns_number <<
").\n";
10121 throw logic_error(buffer.str());
10129 data.resize(samples_count, columns_number);
10133 samples_uses.resize(samples_count);
10134 samples_uses.setConstant(SampleUse::Training);
10140void DataSet::read_csv_3_simple()
10144 if(!file.is_open())
10146 ostringstream buffer;
10148 buffer <<
"OpenNN Exception: DataSet class.\n"
10149 <<
"void read_csv_2_simple() method.\n"
10152 throw logic_error(buffer.str());
10155 const bool is_float = is_same<type, float>::value;
10167 getline(file, line);
10169 if(line.empty())
continue;
10181 Tensor<string, 1> tokens(raw_columns_number);
10183 const Index samples_number =
data.dimension(0);
10187 if(
display) cout <<
"Reading data..." << endl;
10189 Index sample_index = 0;
10190 Index column_index = 0;
10194 getline(file, line);
10200 if(line.empty())
continue;
10202 fill_tokens(line, separator_char, tokens);
10204 for(j = 0; j < raw_columns_number; j++)
10210 rows_labels(sample_index) = tokens(j);
10214 data(sample_index, column_index) =
static_cast<type
>(NAN);
10219 data(sample_index, column_index) = type(strtof(tokens(j).
data(), NULL));
10224 data(sample_index, column_index) = type(stof(tokens(j)));
10235 data_file_preview(data_file_preview_index) = tokens;
10239 if(
display) cout <<
"Data read succesfully..." << endl;
10243 check_constant_columns();
10247 if(
display) cout <<
"Checking binary columns..." << endl;
10249 set_binary_simple_columns();
10253void DataSet::read_csv_2_complete()
10257 if(!file.is_open())
10259 ostringstream buffer;
10261 buffer <<
"OpenNN Exception: DataSet class.\n"
10262 <<
"void read_csv_2_complete() method.\n"
10265 throw logic_error(buffer.str());
10272 Tensor<string, 1> tokens;
10274 Index lines_count = 0;
10275 Index tokens_count;
10277 const Index columns_number = columns.size();
10279 for(
unsigned j = 0; j < columns_number; j++)
10281 if(columns(j).type != ColumnType::Categorical)
10283 columns(j).column_use = VariableUse::Input;
10293 getline(file, line);
10297 if(line.empty())
continue;
10305 if(
display) cout <<
"Setting data dimensions..." << endl;
10307 const Index raw_columns_number =
has_rows_labels ? columns_number + 1 : columns_number;
10309 Index column_index = 0;
10313 getline(file, line);
10317 if(line.empty())
continue;
10319 tokens = get_tokens(line, separator_char);
10321 tokens_count = tokens.size();
10323 if(
static_cast<unsigned>(tokens_count) != raw_columns_number)
10325 const string message =
10326 "Sample " + to_string(lines_count+1) +
" error:\n"
10327 "Size of tokens (" + to_string(tokens_count) +
") is not equal to number of columns (" + to_string(raw_columns_number) +
").\n"
10328 "Please check the format of the data file (e.g: Use of commas both as decimal and column separator)";
10330 throw logic_error(message);
10333 for(
unsigned j = 0; j < raw_columns_number; j++)
10339 if(columns(column_index).type == ColumnType::Categorical)
10341 if(find(columns(column_index).categories.data(), columns(column_index).categories.data() + columns(column_index).categories.size(), tokens(j)) == (columns(column_index).categories.data() + columns(column_index).categories.size()))
10349 columns(column_index).add_category(tokens(j));
10361 if(
display) cout <<
"Setting types..." << endl;
10363 for(Index j = 0; j < columns_number; j++)
10365 if(columns(j).type == ColumnType::Categorical)
10367 if(columns(j).categories.size() == 2)
10369 columns(j).type = ColumnType::Binary;
10376 const Index samples_number =
static_cast<unsigned>(lines_count);
10380 data.resize(
static_cast<Index
>(samples_number), variables_number);
10387 samples_uses.resize(
static_cast<Index
>(samples_number));
10389 samples_uses.setConstant(SampleUse::Training);
10395void DataSet::read_csv_3_complete()
10399 if(!file.is_open())
10401 ostringstream buffer;
10403 buffer <<
"OpenNN Exception: DataSet class.\n"
10404 <<
"void read_csv_3_complete() method.\n"
10407 throw logic_error(buffer.str());
10412 const Index columns_number = columns.size();
10414 const Index raw_columns_number =
has_rows_labels ? columns_number+1 : columns_number;
10418 Tensor<string, 1> tokens;
10422 unsigned sample_index = 0;
10423 unsigned variable_index = 0;
10424 unsigned column_index = 0;
10432 getline(file, line);
10436 if(line.empty())
continue;
10444 if(
display) cout <<
"Reading data..." << endl;
10448 getline(file, line);
10454 if(line.empty())
continue;
10456 tokens = get_tokens(line, separator_char);
10458 variable_index = 0;
10461 for(Index j = 0; j < raw_columns_number; j++)
10467 rows_labels(sample_index) = tokens(j);
10470 else if(columns(column_index).type == ColumnType::Numeric)
10474 data(sample_index, variable_index) =
static_cast<type
>(NAN);
10481 data(sample_index, variable_index) =
static_cast<type
>(stod(tokens(j)));
10484 catch(invalid_argument)
10486 ostringstream buffer;
10488 buffer <<
"OpenNN Exception: DataSet class.\n"
10489 <<
"void read_csv_3_complete() method.\n"
10490 <<
"Sample " << sample_index <<
"; Invalid number: " << tokens(j) <<
"\n";
10492 throw logic_error(buffer.str());
10496 else if(columns(column_index).type == ColumnType::DateTime)
10500 data(sample_index, variable_index) =
static_cast<type
>(NAN);
10505 data(sample_index, variable_index) =
static_cast<type
>(date_to_timestamp(tokens(j), gmt));
10510 else if(columns(column_index).type == ColumnType::Categorical)
10512 for(Index k = 0; k < columns(column_index).get_categories_number(); k++)
10516 data(sample_index, variable_index) =
static_cast<type
>(NAN);
10518 else if(tokens(j) == columns(column_index).categories(k))
10520 data(sample_index, variable_index) = type(1);
10526 else if(columns(column_index).type == ColumnType::Binary)
10530 data(sample_index, variable_index) =
static_cast<type
>(NAN);
10532 else if(columns(column_index).categories.size() > 0 && tokens(j) == columns(column_index).categories(0))
10534 data(sample_index, variable_index) = type(1);
10536 else if(tokens(j) == columns(column_index).name)
10538 data(sample_index, variable_index) = type(1);
10552 data_file_preview(data_file_preview_index) = tokens;
10554 if(
display) cout <<
"Data read succesfully..." << endl;
10560 check_constant_columns();
10564 if(
display) cout <<
"Checking binary columns..." << endl;
10566 set_binary_simple_columns();
10571void DataSet::check_separators(
const string& line)
const
10573 if(line.find(
',') == string::npos
10574 && line.find(
';') == string::npos
10575 && line.find(
' ') == string::npos
10576 && line.find(
'\t') == string::npos)
return;
10580 if(line.find(separator_char) == string::npos)
10582 const string message =
10584 "Line: '" + line +
"'";
10586 throw logic_error(message);
10591 if(line.find(
',') != string::npos)
10593 const string message =
10594 "Error: Found comma (',') in data file " +
data_file_name +
", but separator is space (' ').";
10596 throw logic_error(message);
10598 if(line.find(
';') != string::npos)
10600 const string message =
10601 "Error: Found semicolon (';') in data file " +
data_file_name +
", but separator is space (' ').";
10603 throw logic_error(message);
10608 if(line.find(
',') != string::npos)
10610 const string message =
10611 "Error: Found comma (',') in data file " +
data_file_name +
", but separator is tab (' ').";
10613 throw logic_error(message);
10615 if(line.find(
';') != string::npos)
10617 const string message =
10618 "Error: Found semicolon (';') in data file " +
data_file_name +
", but separator is tab (' ').";
10620 throw logic_error(message);
10625 if(line.find(
";") != string::npos)
10627 const string message =
10628 "Error: Found semicolon (';') in data file " +
data_file_name +
", but separator is comma (',').";
10630 throw logic_error(message);
10633 else if(
separator == Separator::Semicolon)
10635 if(line.find(
",") != string::npos)
10637 const string message =
10638 "Error: Found comma (',') in data file " +
data_file_name +
", but separator is semicolon (';'). " + line;
10640 throw logic_error(message);
10646void DataSet::check_special_characters(
const string & line)
const
10648 if(line.find_first_of(
"|@#~€¬^*") != string::npos)
10650 const string message =
10651 "Error: found special characters in line: " + line +
". Please, review the file.";
10653 throw logic_error(message);
10668bool DataSet::has_binary_columns()
const
10670 const Index variables_number = columns.size();
10672 for(Index i = 0; i < variables_number; i++)
10674 if(columns(i).type == ColumnType::Binary)
return true;
10681bool DataSet::has_categorical_columns()
const
10683 const Index variables_number = columns.size();
10685 for(Index i = 0; i < variables_number; i++)
10687 if(columns(i).type == ColumnType::Categorical)
return true;
10694bool DataSet::has_time_columns()
const
10696 const Index columns_number = columns.size();
10698 for(Index i = 0; i < columns_number; i++)
10700 if(columns(i).type == ColumnType::DateTime)
return true;
10707bool DataSet::has_time_time_series_columns()
const
10709 const Index time_series_columns_number = time_series_columns.size();
10711 for(Index i = 0; i < time_series_columns_number; i++)
10713 if(time_series_columns(i).type == ColumnType::DateTime)
return true;
10721bool DataSet::has_selection()
const
10729Tensor<Index, 1> DataSet::count_nan_columns()
const
10732 const Index rows_number = get_samples_number();
10735 nan_columns.setZero();
10737 for(Index column_index = 0; column_index < columns_number; column_index++)
10741 for(Index row_index = 0; row_index < rows_number; row_index++)
10743 if(
isnan(
data(row_index,current_variable_index)))
10745 nan_columns(column_index)++;
10750 return nan_columns;
10754Index DataSet::count_rows_with_nan()
const
10756 Index rows_with_nan = 0;
10758 const Index rows_number =
data.dimension(0);
10759 const Index columns_number =
data.dimension(1);
10763 for(Index row_index = 0; row_index < rows_number; row_index++)
10767 for(Index column_index = 0; column_index < columns_number; column_index++)
10769 if(
isnan(
data(row_index, column_index)))
10779 return rows_with_nan;
10783Index DataSet::count_nan()
const
10785 const Index rows_number =
data.dimension(0);
10786 const Index columns_number =
data.dimension(1);
10790 #pragma omp parallel for reduction(+: count)
10792 for(Index row_index = 0; row_index < rows_number; row_index++)
10794 for(Index column_index = 0; column_index < columns_number; column_index++)
10796 if(
isnan(
data(row_index, column_index))) count++;
10804void DataSet::set_missing_values_number(
const Index& new_missing_values_number)
10810void DataSet::set_missing_values_number()
10816void DataSet::set_columns_missing_values_number(
const Tensor<Index, 1>& new_columns_missing_values_number)
10818 columns_missing_values_number = new_columns_missing_values_number;
10822void DataSet::set_columns_missing_values_number()
10824 columns_missing_values_number = count_nan_columns();
10828void DataSet::set_rows_missing_values_number(
const Index& new_rows_missing_values_number)
10830 rows_missing_values_number = new_rows_missing_values_number;
10834void DataSet::set_rows_missing_values_number()
10836 rows_missing_values_number = count_rows_with_nan();
10840void DataSet::fix_repeated_names()
10844 const Index columns_number = columns.size();
10846 map<string, Index> columns_count_map;
10848 for(Index i = 0; i < columns_number; i++)
10850 auto result = columns_count_map.insert(pair<string, Index>(columns(i).name, 1));
10852 if(!result.second) result.first->second++;
10855 for(
auto & element : columns_count_map)
10857 if(element.second > 1)
10859 const string repeated_name = element.first;
10860 Index repeated_index = 1;
10862 for(Index i = 0; i < columns.size(); i++)
10864 if(columns(i).name == repeated_name)
10866 columns(i).name = columns(i).name +
"_" + to_string(repeated_index);
10875 if(has_categorical_columns() || has_binary_columns())
10879 const Index variables_number = variables_names.size();
10881 map<string, Index> variables_count_map;
10883 for(Index i = 0; i < variables_number; i++)
10885 auto result = variables_count_map.insert(pair<string, Index>(variables_names(i), 1));
10887 if(!result.second) result.first->second++;
10890 for(
auto & element : variables_count_map)
10892 if(element.second > 1)
10894 const string repeated_name = element.first;
10896 for(Index i = 0; i < variables_number; i++)
10898 if(variables_names(i) == repeated_name)
10902 if(columns(column_index).type != ColumnType::Categorical)
continue;
10904 variables_names(i) = variables_names(i) +
"_" + columns(column_index).name;
10915Tensor<Index, 1> DataSet::push_back(
const Tensor<Index, 1>& old_vector,
const Index& new_string)
const
10917 const Index old_size = old_vector.size();
10919 const Index new_size = old_size+1;
10921 Tensor<Index, 1> new_vector(new_size);
10923 for(Index i = 0; i < old_size; i++) new_vector(i) = old_vector(i);
10925 new_vector(new_size-1) = new_string;
10931Tensor<string, 1> DataSet::push_back(
const Tensor<string, 1>& old_vector,
const string& new_string)
const
10933 const Index old_size = old_vector.size();
10935 const Index new_size = old_size+1;
10937 Tensor<string, 1> new_vector(new_size);
10939 for(Index i = 0; i < old_size; i++) new_vector(i) = old_vector(i);
10941 new_vector(new_size-1) = new_string;
10947void DataSet::initialize_sequential(Tensor<Index, 1>& new_tensor,
10948 const Index& start,
const Index& step,
const Index& end)
const
10950 const Index new_size = (end-start)/step+1;
10952 new_tensor.resize(new_size);
10953 new_tensor(0) = start;
10955 for(Index i = 1; i < new_size-1; i++)
10957 new_tensor(i) = new_tensor(i-1)+step;
10960 new_tensor(new_size-1) = end;
10964void DataSet::intialize_sequential(Tensor<type, 1>& new_tensor,
10965 const type& start,
const type& step,
const type& end)
const
10967 const Index new_size = Index((end-start)/type(step) + type(1));
10969 new_tensor.resize(new_size);
10970 new_tensor(0) = start;
10972 for(Index i = 1; i < new_size-1; i++)
10974 new_tensor(i) = new_tensor(i-1)+step;
10977 new_tensor(new_size-1) = end;
10981Tensor<Index, 2> DataSet::split_samples(
const Tensor<Index, 1>& samples_indices,
const Index& new_batch_size)
const
10983 const Index samples_number = samples_indices.dimension(0);
10985 Index batches_number;
10986 Index batch_size = new_batch_size;
10988 if(samples_number < batch_size)
10990 batches_number = 1;
10991 batch_size = samples_number;
10995 batches_number = samples_number / batch_size;
10998 Tensor<Index, 2> batches(batches_number, batch_size);
11002 for(Index i = 0; i < batches_number; ++i)
11004 for(Index j = 0; j < batch_size; ++j)
11006 batches(i,j) = samples_indices(count);
11016void DataSetBatch::fill(
const Tensor<Index, 1>& samples,
11017 const Tensor<Index, 1>& inputs,
11018 const Tensor<Index, 1>& targets)
11020 const Tensor<type, 2>&
data = data_set_pointer->get_data();
11022 const Tensor<Index, 1>& input_variables_dimensions = data_set_pointer->get_input_variables_dimensions();
11024 if(input_variables_dimensions.size() == 1)
11026 fill_submatrix(
data, samples, inputs, inputs_2d.data());
11028 else if(input_variables_dimensions.size() == 4)
11030 const Index samples_number = input_variables_dimensions(0);
11031 const Index channels_number = input_variables_dimensions(1);
11032 const Index rows_number = input_variables_dimensions(2);
11033 const Index columns_number = input_variables_dimensions(3);
11035 inputs_4d.resize(samples_number, channels_number, rows_number, columns_number);
11039 for(Index image = 0; image < samples_number; image++)
11043 for(Index channel = 0; channel < channels_number; channel++)
11045 for(Index row = 0; row < rows_number; row++)
11047 for(Index column = 0; column < columns_number; column++)
11049 inputs_4d(image, channel, row, column) =
data(image, index);
11057 fill_submatrix(
data, samples, targets, targets_2d.data());
11063 set(new_samples_number, new_data_set_pointer);
11067void DataSetBatch::set(
const Index& new_samples_number,
DataSet* new_data_set_pointer)
11069 samples_number = new_samples_number;
11071 data_set_pointer = new_data_set_pointer;
11074 const Index target_variables_number = data_set_pointer->get_target_variables_number();
11076 const Tensor<Index, 1> input_variables_dimensions = data_set_pointer->get_input_variables_dimensions();
11078 if(input_variables_dimensions.rank() == 1)
11080 inputs_2d.resize(samples_number, input_variables_number);
11082 else if(input_variables_dimensions.rank() == 3)
11084 const Index channels_number = input_variables_dimensions(0);
11085 const Index rows_number = input_variables_dimensions(1);
11086 const Index columns_number = input_variables_dimensions(2);
11088 inputs_4d.resize(samples_number, channels_number, rows_number, columns_number);
11091 targets_2d.resize(samples_number, target_variables_number);
11095Index DataSetBatch::get_samples_number()
const
11097 return samples_number;
11101void DataSetBatch::print()
const
11103 cout <<
"Batch structure" << endl;
11105 cout <<
"Inputs:" << endl;
11106 cout << inputs_2d << endl;
11108 cout <<
"Targets:" << endl;
11109 cout << targets_2d << endl;
11113void DataSet::shuffle()
11116 mt19937 urng(rng());
11118 const Index data_rows =
data.dimension(0);
11119 const Index data_columns =
data.dimension(1);
11121 Tensor<Index, 1> indices(data_rows);
11123 for(Index i = 0; i < data_rows; i++) indices(i) = i;
11125 std::shuffle(&indices(0), &indices(data_rows-1), urng);
11127 Tensor<type, 2> new_data(data_rows, data_columns);
11128 Tensor<string, 1> new_rows_labels(data_rows);
11132 for(Index i = 0; i < data_rows; i++)
11134 index = indices(i);
11136 new_rows_labels(i) = rows_labels(index);
11138 for(Index j = 0; j < data_columns; j++)
11140 new_data(i,j) =
data(index,j);
11145 rows_labels = new_rows_labels;
11149bool DataSet::get_has_rows_labels()
const
This class represents the concept of data set for data modelling problems, such as approximation,...
Index calculate_testing_negatives(const Index &) const
bool is_sample_used(const Index &) const
Tensor< type, 1 > calculate_input_variables_maximums() const
Returns a vector containing the maximums of the input variables.
void transform_time_series_columns()
This method transforms the columns into time series for forecasting problems.
Tensor< type, 2 > get_target_data() const
Index get_training_samples_number() const
Returns the number of samples in the data set which will be used for training.
Tensor< type, 1 > calculate_used_variables_minimums() const
Returns a vector containing the minimums of the used variables.
Tensor< type, 2 > calculate_autocorrelations(const Index &=10) const
VariableUse get_column_use(const Index &) const
Returns a vector containing the use of the column, without taking into account the categories.
Index get_target_variables_number() const
Returns the number of target variables of the data set.
Tensor< Descriptives, 1 > calculate_columns_descriptives_training_samples() const
void set_sample_use(const Index &, const SampleUse &)
void set_training()
Sets all the samples in the data set for training.
void impute_missing_values_unuse()
Sets all the samples with missing values to "Unused".
Tensor< Descriptives, 1 > calculate_columns_descriptives_positive_samples() const
Calculate the descriptives of the samples with positive targets in binary classification problems.
void save_data_binary(const string &) const
Saves to the data file the values of the data matrix in binary format.
Tensor< string, 1 > get_target_columns_names() const
Returns a string vector which contains the names of the columns whose uses are Target.
Tensor< string, 1 > get_columns_names() const
Returns a string vector that contains the names of the columns.
Tensor< Descriptives, 1 > calculate_input_variables_descriptives() const
void set_variable_name(const Index &, const string &)
Tensor< Descriptives, 1 > scale_target_variables()
Tensor< Index, 1 > calculate_local_outlier_factor_outliers(const Index &=20, const Index &=0, const type &=type(0)) const
void set_input_variables_dimensions(const Tensor< Index, 1 > &)
Sets new input dimensions in the data set.
Tensor< Index, 1 > get_training_samples_indices() const
Returns the indices of the samples which will be used for training.
void set_input()
Sets all the variables in the data set as input variables.
bool has_columns_names
Header which contains variables name.
void generate_constant_data(const Index &, const Index &, const type &)
void generate_Rosenbrock_data(const Index &, const Index &)
Tensor< string, 1 > get_time_series_variables_names() const
void print_top_input_target_columns_correlations() const
void set_has_columns_names(const bool &)
Sets if the data file contains a header with the names of the columns.
Tensor< Index, 1 > get_testing_samples_indices() const
Returns the indices of the samples which will be used for testing.
void set_columns_number(const Index &)
Tensor< Index, 1 > get_selection_samples_indices() const
Returns the indices of the samples which will be used for selection.
Tensor< type, 1 > calculate_selection_targets_mean() const
Returns the mean values of the target variables on the selection.
const bool & get_display() const
void set_selection()
Sets all the samples in the data set for selection.
Tensor< Histogram, 1 > calculate_columns_distribution(const Index &=10) const
const Tensor< type, 2 > & get_data() const
Index get_variables_number() const
Returns the number of variables in the data set.
void set_data_constant(const type &)
Tensor< type, 2 > get_sample_target_data(const Index &) const
void set_samples_unused()
Sets all the samples in the data set for unused.
Tensor< type, 3 > calculate_cross_correlations(const Index &=10) const
Calculates the cross-correlation between all the variables in the data set.
Tensor< Index, 1 > get_target_columns_indices() const
Returns a indices vector with the positions of the targets.
void split_samples_sequential(const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2))
void unscale_input_variables(const Tensor< Descriptives, 1 > &)
Index lags_number
Number of lags.
Tensor< Descriptives, 1 > calculate_target_variables_descriptives() const
void set_target()
Sets all the variables in the data set as target variables.
void impute_missing_values_mean()
Substitutes all the missing values by the mean of the corresponding variable.
string data_file_name
Data file name.
Tensor< Descriptives, 1 > calculate_variables_descriptives() const
void check_input_csv(const string &, const char &) const
This method checks if the input data file has the correct format. Returns an error message.
char get_separator_char() const
Returns the string which will be used as separator in the data file.
const bool & get_rows_label() const
Returns true if the data file has rows label, and false otherwise.
Tensor< Tensor< Index, 1 >, 1 > calculate_Tukey_outliers(const type &=type(1.5)) const
void load(const string &)
void load_data_binary()
This method loads the data from a binary data file.
void set_samples_uses(const Tensor< SampleUse, 1 > &)
string missing_values_label
Missing values label.
const Index & get_lags_number() const
Returns the number of lags to be used in a time series prediction application.
void set_variables_unused()
Sets all the variables in the data set as unused variables.
void load_time_series_data_binary(const string &)
This method loads time series data from a binary data.
Index calculate_training_negatives(const Index &) const
Tensor< Index, 1 > get_used_samples_indices() const
Returns the indices of the used samples(those which are not set unused).
void set_missing_values_method(const MissingValuesMethod &)
Tensor< type, 1 > get_sample_data(const Index &) const
Tensor< type, 2 > get_sample_input_data(const Index &) const
bool display
Display messages to screen.
void impute_missing_values_median()
Substitutes all the missing values by the median of the corresponding variable.
void set_separator(const Separator &)
Tensor< type, 1 > calculate_input_variables_minimums() const
Returns a vector containing the minimums of the input variables.
string get_sample_string(const Index &, const string &=",") const
Index get_time_series_columns_number() const
Returns the number of columns in the time series.
Index get_unused_samples_number() const
virtual ~DataSet()
Destructor.
void print_inputs_correlations() const
Print on screen the correlation between variables in the data set.
Index get_unused_variables_number() const
Returns the number of variables which will neither be used as input nor as target.
Index steps_ahead
Number of steps ahead.
bool is_empty() const
Returns true if the data matrix is empty, and false otherwise.
void set_lags_number(const Index &)
Index get_time_columns_number() const
Returns the number of columns whose uses are Time.
const Tensor< Index, 1 > & get_input_variables_dimensions() const
Returns the dimensions of the input variables.
Tensor< Index, 1 > calculate_target_distribution() const
void set_samples_number(const Index &)
Tensor< Index, 1 > get_input_columns_indices() const
Returns a indices vector with the positions of the inputs.
SampleUse get_sample_use(const Index &) const
Tensor< VariableUse, 1 > get_columns_uses() const
Returns the uses of each columns of the data set.
Tensor< string, 1 > get_used_columns_names() const
Returns a string vector which contains the names of the columns used whether Input,...
void set()
Sets zero samples and zero variables in the data set.
void set_has_rows_label(const bool &)
Sets if the data file contains rows label.
Tensor< Index, 1 > get_target_variables_indices() const
Returns the indices of the target variables.
Tensor< string, 1 > unuse_uncorrelated_columns(const type &=type(0.25))
void set_variables_names(const Tensor< string, 1 > &)
void split_samples_random(const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2))
MissingValuesMethod
Enumeration of available methods for missing values in the data.
Tensor< Index, 1 > get_samples_uses_numbers() const
Tensor< Index, 1 > unuse_repeated_samples()
void set_testing()
Sets all the samples in the data set for testing.
void print_top_inputs_correlations() const
const string & get_missing_values_label() const
Returns the string which will be used as label for the missing values in the data file.
string get_variable_name(const Index &) const
void print_data() const
Prints to the screen the values of the data matrix.
const Separator & get_separator() const
Returns the separator to be used in the data file.
Tensor< type, 1 > calculate_target_variables_minimums() const
Returns a vector containing the minimums of the target variables.
void set_missing_values_label(const string &)
Index get_selection_samples_number() const
Returns the number of samples in the data set which will be used for selection.
const Tensor< SampleUse, 1 > & get_samples_uses() const
Returns the use of every sample (training, selection, testing or unused) in a vector.
Index get_variable_index(const string &name) const
Tensor< type, 2 > get_selection_target_data() const
void save_data() const
Saves to the data file the values of the data matrix.
void save(const string &) const
Tensor< type, 2 > get_testing_data() const
Tensor< Descriptives, 1 > calculate_used_variables_descriptives() const
MissingValuesMethod missing_values_method
Missing values method.
Tensor< type, 2 > get_training_data() const
Tensor< bool, 1 > get_input_columns_binary() const
Returns the input columns of the data set.
Tensor< type, 2 > get_testing_input_data() const
void transform_time_series()
Arranges an input-target DataSet from a time series matrix, according to the number of lags.
VariableUse get_variable_use(const Index &) const
Tensor< type, 1 > calculate_variables_means(const Tensor< Index, 1 > &) const
Tensor< Descriptives, 1 > calculate_columns_descriptives_selection_samples() const
string get_separator_string() const
Returns the string which will be used as separator in the data file.
Separator
Enumeration of available separators for the data file.
Index get_input_variables_number() const
Tensor< Index, 1 > get_unused_columns_indices() const
Returns a indices vector with the positions of the unused columns.
Tensor< type, 2 > read_input_csv(const string &, const char &, const string &, const bool &, const bool &) const
This method loads data from a file and returns a matrix containing the input columns.
Index get_target_columns_number() const
Returns the number of columns whose uses are Target.
Tensor< type, 2 > time_series_data
Tensor< Correlation, 2 > calculate_input_columns_correlations() const
void generate_random_data(const Index &, const Index &)
void scrub_missing_values()
Index get_testing_samples_number() const
Returns the number of samples in the data set which will be used for testing.
const string & get_time_column() const
Returns the name of the column used as the time variable in the data set.
Index calculate_selection_negatives(const Index &) const
Index get_used_columns_number() const
Returns the number of columns that are used.
bool has_nan_row(const Index &) const
Returns true if the given row contains missing values.
Index get_time_series_variables_number() const
Returns the number of variables in the time series data.
void print_missing_values_information() const
void set_default_columns_names()
Index calculate_used_negatives(const Index &) const
Index get_used_samples_number() const
void save_time_series_data_binary(const string &) const
Saves to the data file the values of the time series data matrix in binary format.
string time_column
Index where time variable is located for forecasting applications.
Tensor< Index, 1 > get_used_columns_indices() const
Returns a indices vector with the positions of the used columns.
Tensor< string, 1 > get_variables_names() const
Tensor< type, 2 > get_selection_input_data() const
Tensor< type, 2 > get_time_series_column_data(const Index &) const
Index get_unused_columns_number() const
Returns the number of columns that are not used.
const Index & get_steps_ahead() const
Returns the number of steps ahead to be used in a time series prediction application.
bool has_rows_labels
Header which contains the rows label.
Tensor< Correlation, 2 > calculate_input_target_columns_correlations() const
void set_default_columns_scalers()
void unscale_target_variables(const Tensor< Descriptives, 1 > &)
Tensor< VariableUse, 1 > get_variables_uses() const
static Scaler get_scaling_unscaling_method(const string &)
void set_data_binary_random()
void set_time_column(const string &)
Tensor< Index, 1 > get_variable_indices(const Index &) const
Tensor< type, 1 > get_variable_data(const Index &) const
Tensor< type, 2 > get_selection_data() const
Tensor< type, 2 > get_column_data(const Index &) const
Tensor< string, 1 > get_input_columns_names() const
Returns a string vector that contains the names of the columns whose uses are Input.
void set_data_file_name(const string &)
void print() const
Prints to the screen in text format the main numbers from the data set object.
const bool & get_header_line() const
Returns true if the first line of the data file has a header with the names of the variables,...
Index missing_values_number
Missing values.
Tensor< type, 2 > get_input_data() const
void set_column_name(const Index &, const string &)
void set_display(const bool &)
Tensor< string, 1 > get_target_variables_names() const
Tensor< Index, 1 > get_unused_samples_indices() const
Returns the indices of the samples set unused.
void set_input_columns_unused()
Sets all input columns in the data_set as unused columns.
void print_data_preview() const
Tensor< type, 2 > get_training_input_data() const
Tensor< Index, 1 > get_input_variables_indices() const
Returns the indices of the input variables.
Tensor< string, 1 > get_input_variables_names() const
void set_columns_unused()
Sets all columns in the data_set as unused columns.
Tensor< Index, 1 > filter_data(const Tensor< type, 1 > &, const Tensor< type, 1 > &)
void write_XML(tinyxml2::XMLPrinter &) const
Serializes the data set object into a XML document of the TinyXML library without keep the DOM tree i...
Tensor< Descriptives, 1 > calculate_columns_descriptives_negative_samples() const
Calculate the descriptives of the samples with negative targets in binary classification problems.
Tensor< type, 1 > get_samples_uses_percentages() const
void set_data(const Tensor< type, 2 > &)
Tensor< BoxPlot, 1 > calculate_columns_box_plots() const
void generate_sequential_data(const Index &, const Index &)
Tensor< Index, 2 > get_batches(const Tensor< Index, 1 > &, const Index &, const bool &, const Index &buffer_size=100) const
Separator separator
Separator character.
Index get_column_index(const string &) const
void set_columns_names(const Tensor< string, 1 > &)
Tensor< Column, 1 > get_input_columns() const
Returns the input columns of the data set.
Tensor< string, 1 > unuse_constant_columns()
bool is_sample_unused(const Index &) const
void set_column_use(const Index &, const VariableUse &)
MissingValuesMethod get_missing_values_method() const
Returns a string with the method used.
Tensor< Descriptives, 1 > calculate_columns_descriptives_categories(const Index &) const
Index get_columns_number() const
Returns the number of columns in the data set.
Tensor< Index, 1 > get_unused_variables_indices() const
Returns the indices of the unused variables.
bool has_nan() const
Returns true if the data contain missing values.
const string & get_data_file_name() const
Returns the name of the data file.
void set_steps_ahead_number(const Index &)
Tensor< Column, 1 > get_used_columns() const
Returns the used columns of the data set.
void set_default_columns_uses()
const Tensor< type, 2 > & get_time_series_data() const
void print_input_target_columns_correlations() const
Print on screen the correlation between targets and inputs.
Tensor< type, 2 > get_testing_target_data() const
Tensor< Descriptives, 1 > scale_input_variables()
Tensor< Index, 1 > get_used_variables_indices() const
Returns the indices of the used variables.
void unuse_Tukey_outliers(const type &=type(1.5))
Tensor< Column, 1 > get_columns() const
Returns the columns of the data set.
Tensor< type, 1 > calculate_target_variables_maximums() const
Returns a vector containing the maximums of the target variables.
Index get_input_columns_number() const
Returns the number of columns whose uses are Input.
Tensor< type, 2 > get_training_target_data() const
Index get_used_variables_number() const
Returns the number of variables which are used either as input or as target.
Tensor< Column, 1 > get_target_columns() const
Returns the target columns of the data set.
void set_columns_uses(const Tensor< string, 1 > &)
const XMLElement * NextSiblingElement(const char *name=nullptr) const
Get the next(right) sibling element of this node, with an optionally supplied name.
void PushText(const char *text, bool cdata=false)
Add a text node.
void PushAttribute(const char *name, const char *value)
If streaming, add an attribute to an open element.
virtual void CloseElement(bool compactMode=false)
If streaming, close the Element.
uint32 sqrt(uint32 &r, int &exp)
uint32 log2(uint32 m, unsigned int n=32)
HALF_CONSTEXPR half abs(half arg)
HALF_CONSTEXPR bool isnan(half arg)
Index get_categories_number() const
Returns the number of categories.
virtual ~Column()
Destructor.
void set_categories_uses(const Tensor< string, 1 > &)
Tensor< string, 1 > categories
Categories within the column.
void add_category(const string &)
VariableUse column_use
Column use.
Tensor< VariableUse, 1 > categories_uses
Categories use.
void set_use(const VariableUse &)
void set_type(const string &)
Tensor< string, 1 > get_used_variables_names() const
Returns a string vector that contains the names of the used variables in the data set.
Index get_used_categories_number() const
Returns the number of used categories.
Column()
Default constructor.
DataSetBatch()
Default constructor.