9#include "quasi_newton_method.h"
77 case InverseHessianApproximationMethod::DFP:
80 case InverseHessianApproximationMethod::BFGS:
86 buffer <<
"OpenNN Exception: QuasiNewtonMethod class.\n"
87 <<
"string write_inverse_hessian_approximation_method() const method.\n"
88 <<
"Unknown inverse hessian approximation method.\n";
90 throw logic_error(buffer.str());
94const Index& QuasiNewtonMethod::get_epochs_number()
const
173 if(new_inverse_hessian_approximation_method_name ==
"DFP")
177 else if(new_inverse_hessian_approximation_method_name ==
"BFGS")
183 ostringstream buffer;
185 buffer <<
"OpenNN Exception: QuasiNewtonMethod class.\n"
186 <<
"void set_inverse_hessian_approximation_method(const string&) method.\n"
187 <<
"Unknown inverse hessian approximation method: " << new_inverse_hessian_approximation_method_name <<
".\n";
189 throw logic_error(buffer.str());
271 if(new_maximum_time <
static_cast<type
>(0.0))
273 ostringstream buffer;
275 buffer <<
"OpenNN Exception: QuasiNewtonMethod class.\n"
276 <<
"void set_maximum_time(const type&) method.\n"
277 <<
"Maximum time must be equal or greater than 0.\n";
279 throw logic_error(buffer.str());
290void QuasiNewtonMethod::initialize_inverse_hessian_approximation(
QuasiNewtonMehtodData& optimization_data)
const
292 optimization_data.inverse_hessian.setZero();
294 const Index parameters_number = optimization_data.inverse_hessian.dimension(0);
296 for(Index i = 0; i < parameters_number; i++) optimization_data.inverse_hessian(i,i) = type(1);
311 case InverseHessianApproximationMethod::DFP:
316 case InverseHessianApproximationMethod::BFGS:
322 ostringstream buffer;
324 buffer <<
"OpenNN Exception: QuasiNewtonMethod class.\n"
325 <<
"Tensor<type, 1> calculate_inverse_hessian_approximation(const Tensor<type, 1>&, "
326 "const Tensor<type, 1>&, const Tensor<type, 1>&, const Tensor<type, 1>&, const Tensor<type, 2>&) method.\n"
327 <<
"Unknown inverse hessian approximation method.\n";
329 throw logic_error(buffer.str());
337 auto ml = Eigen::Map<Eigen::Matrix<type,Eigen::Dynamic,Eigen::Dynamic,Eigen::RowMajor >>
338 (left_matrix.data(),left_matrix.dimension(0), 1);
340 auto mr = Eigen::Map<Eigen::Matrix<type,Eigen::Dynamic,Eigen::Dynamic,Eigen::RowMajor>>
341 (right_matrix.data(),right_matrix.dimension(0), 1);
345 auto product = kroneckerProduct(ml,mr).eval();
349 TensorMap< Tensor<type, 2> > direct_matrix(product.data(), left_matrix.size(), left_matrix.size());
351 return direct_matrix;
364 auto ml = Eigen::Map<Eigen::Matrix<type,Eigen::Dynamic,Eigen::Dynamic,Eigen::RowMajor >>
365 (left_matrix.data(),left_matrix.dimension(0),left_matrix.dimension(1));
367 auto mr = Eigen::Map<Eigen::Matrix<type,Eigen::Dynamic,Eigen::Dynamic,Eigen::RowMajor>>
368 (right_matrix.data(),right_matrix.dimension(0),right_matrix.dimension(1));
372 auto product = kroneckerProduct(ml,mr).eval();
376 TensorMap< Tensor<type, 2> > direct_matrix(product.data(), product.rows(), product.cols());
378 return direct_matrix;
394 Tensor<type, 0> parameters_difference_dot_gradient_difference;
396 parameters_difference_dot_gradient_difference.device(*thread_pool_device)
397 = optimization_data.parameters_difference.contract(optimization_data.gradient_difference, AT_B);
399 optimization_data.old_inverse_hessian_dot_gradient_difference.device(*thread_pool_device)
400 = optimization_data.old_inverse_hessian.contract(optimization_data.gradient_difference, A_B);
402 Tensor<type, 0> gradient_dot_hessian_dot_gradient;
404 gradient_dot_hessian_dot_gradient.device(*thread_pool_device)
405 = optimization_data.gradient_difference.contract(optimization_data.old_inverse_hessian_dot_gradient_difference, AT_B);
409 optimization_data.inverse_hessian = optimization_data.old_inverse_hessian;
411 optimization_data.inverse_hessian
412 +=
kronecker_product(optimization_data.parameters_difference, optimization_data.parameters_difference)
413 /parameters_difference_dot_gradient_difference(0);
415 optimization_data.inverse_hessian
416 -=
kronecker_product(optimization_data.old_inverse_hessian_dot_gradient_difference, optimization_data.old_inverse_hessian_dot_gradient_difference)
417 / gradient_dot_hessian_dot_gradient(0);
436 Tensor<type, 0> parameters_difference_dot_gradient_difference;
438 parameters_difference_dot_gradient_difference.device(*thread_pool_device)
439 = optimization_data.parameters_difference.contract(optimization_data.gradient_difference, AT_B);
442 optimization_data.old_inverse_hessian_dot_gradient_difference.device(*thread_pool_device)
443 = optimization_data.old_inverse_hessian.contract(optimization_data.gradient_difference, A_B);
445 Tensor<type, 0> gradient_dot_hessian_dot_gradient;
447 gradient_dot_hessian_dot_gradient.device(*thread_pool_device)
448 = optimization_data.gradient_difference.contract(optimization_data.old_inverse_hessian_dot_gradient_difference, AT_B);
450 Tensor<type, 1> BFGS(parameters_number);
452 BFGS.device(*thread_pool_device)
453 = optimization_data.parameters_difference/parameters_difference_dot_gradient_difference(0)
454 - optimization_data.old_inverse_hessian_dot_gradient_difference/gradient_dot_hessian_dot_gradient(0);
458 optimization_data.inverse_hessian = optimization_data.old_inverse_hessian;
460 optimization_data.inverse_hessian
461 +=
kronecker_product(optimization_data.parameters_difference, optimization_data.parameters_difference)
462 / parameters_difference_dot_gradient_difference(0);
464 optimization_data.inverse_hessian
465 -=
kronecker_product(optimization_data.old_inverse_hessian_dot_gradient_difference, optimization_data.old_inverse_hessian_dot_gradient_difference)
466 / gradient_dot_hessian_dot_gradient(0);
468 optimization_data.inverse_hessian
492 optimization_data.parameters_difference.device(*thread_pool_device)
493 = back_propagation.parameters - optimization_data.old_parameters;
495 optimization_data.gradient_difference.device(*thread_pool_device)
496 = back_propagation.gradient - optimization_data.old_gradient;
498 optimization_data.old_parameters = back_propagation.parameters;
502 if(optimization_data.epoch == 0
503 || is_zero(optimization_data.parameters_difference)
504 || is_zero(optimization_data.gradient_difference))
506 initialize_inverse_hessian_approximation(optimization_data);
513 optimization_data.training_direction.device(*thread_pool_device)
514 = -optimization_data.inverse_hessian.contract(back_propagation.gradient, A_B);
516 optimization_data.training_slope.device(*thread_pool_device)
517 = back_propagation.gradient.contract(optimization_data.training_direction, AT_B);
519 if(optimization_data.training_slope(0) >= type(0))
521 optimization_data.training_direction.device(*thread_pool_device) = -back_propagation.gradient;
526 optimization_data.epoch == 0
527 ? optimization_data.initial_learning_rate = first_learning_rate
528 : optimization_data.initial_learning_rate = optimization_data.old_learning_rate;
536 optimization_data.learning_rate = directional_point.first;
537 back_propagation.loss = directional_point.second;
539 if(
abs(optimization_data.learning_rate) > type(0))
541 optimization_data.parameters_increment.device(*thread_pool_device)
542 = optimization_data.training_direction*optimization_data.learning_rate;
544 back_propagation.parameters.device(*thread_pool_device) += optimization_data.parameters_increment;
548 const Index parameters_number = back_propagation.parameters.size();
550 for(Index i = 0; i < parameters_number; i++)
552 if(
abs(back_propagation.gradient(i)) < type(NUMERIC_LIMITS_MIN))
554 optimization_data.parameters_increment(i) = type(0);
556 else if(back_propagation.gradient(i) > type(0))
558 back_propagation.parameters(i) -= numeric_limits<type>::epsilon();
560 optimization_data.parameters_increment(i) = -numeric_limits<type>::epsilon();
562 else if(back_propagation.gradient(i) < type(0))
564 back_propagation.parameters(i) += numeric_limits<type>::epsilon();
566 optimization_data.parameters_increment(i) = numeric_limits<type>::epsilon();
570 optimization_data.learning_rate = optimization_data.initial_learning_rate;
575 optimization_data.old_gradient = back_propagation.gradient;
577 optimization_data.old_inverse_hessian = optimization_data.inverse_hessian;
579 optimization_data.old_learning_rate = optimization_data.learning_rate;
583 NeuralNetwork* neural_network_pointer = forward_propagation.neural_network_pointer;
585 neural_network_pointer->
set_parameters(back_propagation.parameters);
602 if(
display) cout <<
"Training with quasi-Newton method...\n";
617 const bool has_selection = data_set_pointer->has_selection();
628 const Tensor<Scaler, 1> input_variables_scalers = data_set_pointer->get_input_variables_scalers();
629 const Tensor<Scaler, 1> target_variables_scalers = data_set_pointer->get_target_variables_scalers();
631 Tensor<Descriptives, 1> input_variables_descriptives;
632 Tensor<Descriptives, 1> target_variables_descriptives;
649 scaling_layer_pointer->
set(input_variables_descriptives, input_variables_scalers);
657 unscaling_layer_pointer->
set(target_variables_descriptives, target_variables_scalers);
660 DataSetBatch training_batch(training_samples_number, data_set_pointer);
661 training_batch.fill(training_samples_indices, input_variables_indices, target_variables_indices);
663 DataSetBatch selection_batch(selection_samples_number, data_set_pointer);
664 selection_batch.fill(selection_samples_indices, input_variables_indices, target_variables_indices);
675 bool stop_training =
false;
677 Index selection_failures = 0;
679 type old_loss = type(0);
680 type loss_decrease = numeric_limits<type>::max();
682 time_t beginning_time, current_time;
683 time(&beginning_time);
694 optimization_data.epoch = epoch;
698 neural_network_pointer->
forward_propagate(training_batch, training_forward_propagation);
700 loss_index_pointer->back_propagate(training_batch, training_forward_propagation, training_back_propagation);
708 neural_network_pointer->
forward_propagate(selection_batch, selection_forward_propagation);
712 loss_index_pointer->calculate_errors(selection_batch, selection_forward_propagation, selection_back_propagation);
713 loss_index_pointer->calculate_error(selection_batch, selection_forward_propagation, selection_back_propagation);
721 elapsed_time =
static_cast<type
>(difftime(current_time, beginning_time));
725 cout <<
"Training error: " << training_back_propagation.error << endl;
726 if(has_selection) cout <<
"Selection error: " << selection_back_propagation.error << endl;
727 cout <<
"Learning rate: " << optimization_data.learning_rate << endl;
728 cout <<
"Elapsed time: " <<
write_time(elapsed_time) << endl;
731 if(epoch != 0) loss_decrease = old_loss - training_back_propagation.loss;
735 if(
display) cout <<
"Epoch " << epoch << endl <<
"Minimum loss decrease reached: " << loss_decrease << endl;
737 stop_training =
true;
739 results.
stopping_condition = OptimizationAlgorithm::StoppingCondition::MinimumLossDecrease;
742 old_loss = training_back_propagation.loss;
746 if(
display) cout <<
"Epoch " << epoch << endl <<
"Loss goal reached: " << training_back_propagation.loss << endl;
748 stop_training =
true;
754 if(
display) cout <<
"Epoch " << epoch << endl <<
"Maximum selection failures reached: " << selection_failures << endl;
756 stop_training =
true;
758 results.
stopping_condition = OptimizationAlgorithm::StoppingCondition::MaximumSelectionErrorIncreases;
762 if(
display) cout <<
"Epoch " << epoch << endl <<
"Maximum number of epochs reached: " << epoch << endl;
764 stop_training =
true;
766 results.
stopping_condition = OptimizationAlgorithm::StoppingCondition::MaximumEpochsNumber;
770 if(
display) cout <<
"Epoch " << epoch << endl <<
"Maximum training time reached: " <<
write_time(elapsed_time) << endl;
772 stop_training =
true;
790 if(stop_training)
break;
792 update_parameters(training_batch, training_forward_propagation, training_back_propagation, optimization_data);
806string QuasiNewtonMethod::write_optimization_algorithm_type()
const
808 return "QUASI_NEWTON_METHOD";
817 ostringstream buffer;
819 file_stream.OpenElement(
"QuasiNewtonMethod");
823 file_stream.OpenElement(
"InverseHessianApproximationMethod");
835 file_stream.OpenElement(
"MinimumLossDecrease");
840 file_stream.
PushText(buffer.str().c_str());
846 file_stream.OpenElement(
"LossGoal");
851 file_stream.
PushText(buffer.str().c_str());
857 file_stream.OpenElement(
"MaximumSelectionErrorIncreases");
862 file_stream.
PushText(buffer.str().c_str());
868 file_stream.OpenElement(
"MaximumEpochsNumber");
873 file_stream.
PushText(buffer.str().c_str());
879 file_stream.OpenElement(
"MaximumTime");
884 file_stream.
PushText(buffer.str().c_str());
890 file_stream.OpenElement(
"HardwareUse");
895 file_stream.
PushText(buffer.str().c_str());
907 Tensor<string, 2> labels_values(8, 2);
911 labels_values(0,0) =
"Inverse hessian approximation method";
916 labels_values(1,0) =
"Learning rate method";
921 labels_values(2,0) =
"Learning rate tolerance";
926 labels_values(3,0) =
"Minimum loss decrease";
931 labels_values(4,0) =
"Loss goal";
936 labels_values(5,0) =
"Maximum selection error increases";
941 labels_values(6,0) =
"Maximum epochs number";
946 labels_values(7,0) =
"Maximum time";
949 return labels_values;
959 ostringstream buffer;
961 buffer <<
"OpenNN Exception: QuasiNewtonMethod class.\n"
962 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
963 <<
"Quasi-Newton method element is nullptr.\n";
965 throw logic_error(buffer.str());
970 const tinyxml2::XMLElement* element = root_element->FirstChildElement(
"InverseHessianApproximationMethod");
974 const string new_inverse_hessian_approximation_method = element->GetText();
980 catch(
const logic_error& e)
982 cerr << e.what() << endl;
996 element_clone = element->DeepClone(&learning_rate_algorithm_document);
998 learning_rate_algorithm_document.InsertFirstChild(element_clone);
1010 const type new_minimum_loss_decrease =
static_cast<type
>(atof(element->GetText()));
1016 catch(
const logic_error& e)
1018 cerr << e.what() << endl;
1029 const type new_loss_goal =
static_cast<type
>(atof(element->GetText()));
1035 catch(
const logic_error& e)
1037 cerr << e.what() << endl;
1044 const tinyxml2::XMLElement* element = root_element->FirstChildElement(
"MaximumSelectionErrorIncreases");
1048 const Index new_maximum_selection_failures =
static_cast<Index
>(atoi(element->GetText()));
1054 catch(
const logic_error& e)
1056 cerr << e.what() << endl;
1067 const Index new_maximum_epochs_number =
static_cast<Index
>(atoi(element->GetText()));
1073 catch(
const logic_error& e)
1075 cerr << e.what() << endl;
1086 const type new_maximum_time =
static_cast<type
>(atof(element->GetText()));
1092 catch(
const logic_error& e)
1094 cerr << e.what() << endl;
1105 const string new_hardware_use = element->GetText();
1111 catch(
const logic_error& e)
1113 cerr << e.what() << endl;
This class represents the concept of data set for data modelling problems, such as approximation,...
Index get_training_samples_number() const
Returns the number of samples in the data set which will be used for training.
Tensor< Descriptives, 1 > scale_target_variables()
Tensor< Index, 1 > get_training_samples_indices() const
Returns the indices of the samples which will be used for training.
Tensor< Index, 1 > get_selection_samples_indices() const
Returns the indices of the samples which will be used for selection.
void unscale_input_variables(const Tensor< Descriptives, 1 > &)
Tensor< Index, 1 > get_target_variables_indices() const
Returns the indices of the target variables.
Index get_selection_samples_number() const
Returns the number of samples in the data set which will be used for selection.
void unscale_target_variables(const Tensor< Descriptives, 1 > &)
Tensor< string, 1 > get_target_variables_names() const
Tensor< Index, 1 > get_input_variables_indices() const
Returns the indices of the input variables.
Tensor< string, 1 > get_input_variables_names() const
Tensor< Descriptives, 1 > scale_input_variables()
A learning rate that is adjusted according to an algorithm during training to minimize training time.
void set_loss_index_pointer(LossIndex *)
void from_XML(const tinyxml2::XMLDocument &)
void set_default()
Sets the members of the learning rate algorithm to their default values.
string write_learning_rate_method() const
Returns a string with the name of the learning rate method to be used.
pair< type, type > calculate_directional_point(const DataSetBatch &, NeuralNetworkForwardPropagation &, LossIndexBackPropagation &, OptimizationAlgorithmData &) const
void write_XML(tinyxml2::XMLPrinter &) const
This abstract class represents the concept of loss index composed of an error term and a regularizati...
virtual string get_error_type() const
Returns a string with the default type of error term, "USER_PERFORMANCE_TERM".
NeuralNetwork * get_neural_network_pointer() const
Returns a pointer to the neural network object associated to the error term.
DataSet * get_data_set_pointer() const
Returns a pointer to the data set object associated to the error term.
ScalingLayer * get_scaling_layer_pointer() const
Returns a pointer to the scaling layers object composing this neural network object.
bool has_scaling_layer() const
bool has_unscaling_layer() const
void forward_propagate(const DataSetBatch &, NeuralNetworkForwardPropagation &) const
Calculate forward propagation in neural network.
void save(const string &) const
void set_parameters(Tensor< type, 1 > &)
UnscalingLayer * get_unscaling_layer_pointer() const
Returns a pointer to the unscaling layers object composing this neural network object.
void set_inputs_names(const Tensor< string, 1 > &)
Index get_parameters_number() const
void set_outputs_names(const Tensor< string, 1 > &)
string neural_network_file_name
Path where the neural network is saved.
void set_hardware_use(const string &)
Set hardware to use. Default: Multi-core.
LossIndex * loss_index_pointer
Pointer to a loss index for a neural network object.
virtual void check() const
bool display
Display messages to screen.
const string write_time(const type &) const
Writes the time from seconds in format HH:mm:ss.
Index save_period
Number of iterations between the training saving progress.
string hardware_use
Hardware use.
Index epochs_number
Number of training epochs in the neural network.
Index display_period
Number of iterations between the training showing progress.
TrainingResults perform_training()
void set_maximum_selection_failures(const Index &)
void update_parameters(const DataSetBatch &batch, NeuralNetworkForwardPropagation &forward_propagation, LossIndexBackPropagation &back_propagation, QuasiNewtonMehtodData &optimization_data)
QuasiNewtonMethod::update_parameters.
const InverseHessianApproximationMethod & get_inverse_hessian_approximation_method() const
Returns the method for approximating the inverse hessian matrix to be used when training.
void set_loss_index_pointer(LossIndex *)
const type & get_maximum_time() const
Returns the maximum training time.
const type & get_loss_goal() const
void from_XML(const tinyxml2::XMLDocument &)
void calculate_DFP_inverse_hessian(QuasiNewtonMehtodData &) const
void set_default()
Sets the members of the optimization algorithm object to their default values.
InverseHessianApproximationMethod inverse_hessian_approximation_method
Variable containing the actual method used to obtain a suitable learning rate.
const Index & get_maximum_epochs_number() const
Returns the maximum number of epochs for training.
const Tensor< type, 2 > kronecker_product(Tensor< type, 2 > &, Tensor< type, 2 > &) const
Tensor< string, 2 > to_string_matrix() const
Writes as matrix of strings the most representative atributes.
type minimum_loss_decrease
Minimum loss improvement between two successive epochs. It is used as a stopping criterion.
LearningRateAlgorithm * get_learning_rate_algorithm_pointer()
Returns a pointer to the learning rate algorithm object inside the quasi-Newton method object.
const LearningRateAlgorithm & get_learning_rate_algorithm() const
Returns a constant reference to the learning rate algorithm object inside the quasi-Newton method obj...
void set_maximum_time(const type &)
void set_inverse_hessian_approximation_method(const InverseHessianApproximationMethod &)
void calculate_BFGS_inverse_hessian(QuasiNewtonMehtodData &) const
InverseHessianApproximationMethod
Enumeration of the available training operators for obtaining the approximation to the inverse hessia...
LearningRateAlgorithm learning_rate_algorithm
void set_loss_goal(const type &)
type maximum_time
Maximum training time. It is used as a stopping criterion.
void set_maximum_epochs_number(const Index &)
void set_minimum_loss_decrease(const type &)
void calculate_inverse_hessian_approximation(QuasiNewtonMehtodData &) const
string write_inverse_hessian_approximation_method() const
Returns the name of the method for the approximation of the inverse hessian.
type training_loss_goal
Goal value for the loss. It is used as a stopping criterion.
Index maximum_epochs_number
Maximum number of epochs to perform_training. It is used as a stopping criterion.
void set_display(const bool &)
void write_XML(tinyxml2::XMLPrinter &) const
virtual ~QuasiNewtonMethod()
const Index & get_maximum_selection_failures() const
Returns the maximum number of selection error increases during the training process.
Index maximum_selection_failures
const type & get_minimum_loss_decrease() const
Returns the minimum loss improvement during training.
This class represents a layer of scaling neurons.
void set()
Sets the scaling layer to be empty.
This class represents a layer of unscaling neurons.
void set()
Sets the unscaling layer to be empty.
void PushText(const char *text, bool cdata=false)
Add a text node.
virtual void CloseElement(bool compactMode=false)
If streaming, close the Element.
HALF_CONSTEXPR half abs(half arg)
This structure contains the optimization algorithm results.
Tensor< type, 1 > selection_error_history
History of the selection error over the training iterations.
void resize_training_error_history(const Index &)
Resizes the training error history keeping the values.
OptimizationAlgorithm::StoppingCondition stopping_condition
Stopping condition of the algorithm.
void resize_selection_error_history(const Index &)
Resizes the selection error history keeping the values.
Tensor< type, 1 > training_error_history
History of the loss function loss over the training iterations.
string elapsed_time
Elapsed time of the training process.