This class represents the concept of data set for data modelling problems, such as approximation, classification or forecasting. More...

#include <data_set.h>

Classes
struct	Column
	This structure represents the columns of the DataSet. More...

Public Types
enum class	Separator { Space , Tab , Comma , Semicolon }
	Enumeration of available separators for the data file. More...

enum class	MissingValuesMethod { Unuse , Mean , Median }
	Enumeration of available methods for missing values in the data. More...

enum class	ProjectType { Approximation , Classification , Forecasting , ImageClassification }
	Enumeration of the learning tasks. More...

enum class	SampleUse { Training , Selection , Testing , UnusedSample }

enum class	VariableUse { Id , Input , Target , Time , UnusedVariable }

enum class	ColumnType { Numeric , Binary , Categorical , DateTime , Constant }

Public Member Functions
	DataSet ()

	DataSet (const Tensor< type, 2 > &)

	DataSet (const Index &, const Index &)

	DataSet (const Index &, const Index &, const Index &)

	DataSet (const string &, const char &, const bool &)

virtual	~DataSet ()
	Destructor. More...

Index	get_samples_number () const

Index	get_training_samples_number () const
	Returns the number of samples in the data set which will be used for training. More...

Index	get_selection_samples_number () const
	Returns the number of samples in the data set which will be used for selection. More...

Index	get_testing_samples_number () const
	Returns the number of samples in the data set which will be used for testing. More...

Index	get_used_samples_number () const

Index	get_unused_samples_number () const

Tensor< Index, 1 >	get_training_samples_indices () const
	Returns the indices of the samples which will be used for training. More...

Tensor< Index, 1 >	get_selection_samples_indices () const
	Returns the indices of the samples which will be used for selection. More...

Tensor< Index, 1 >	get_testing_samples_indices () const
	Returns the indices of the samples which will be used for testing. More...

Tensor< Index, 1 >	get_used_samples_indices () const
	Returns the indices of the used samples (those which are not set unused). More...

Tensor< Index, 1 >	get_unused_samples_indices () const
	Returns the indices of the samples set unused. More...

SampleUse	get_sample_use (const Index &) const

const Tensor< SampleUse, 1 > &	get_samples_uses () const
	Returns the use of every sample (training, selection, testing or unused) in a vector. More...

Tensor< Index, 1 >	get_samples_uses_numbers () const

Tensor< type, 1 >	get_samples_uses_percentages () const

string	get_sample_string (const Index &, const string &=",") const

Tensor< Column, 1 >	get_columns () const
	Returns the columns of the data set. More...

Tensor< Column, 1 >	get_time_series_columns () const

Index	get_time_series_data_rows_number () const

Tensor< Column, 1 >	get_input_columns () const
	Returns the input columns of the data set. More...

Tensor< bool, 1 >	get_input_columns_binary () const
	Returns the input columns of the data set. More...

Tensor< Column, 1 >	get_target_columns () const
	Returns the target columns of the data set. More...

Tensor< Column, 1 >	get_used_columns () const
	Returns the used columns of the data set. More...

Index	get_columns_number () const
	Returns the number of columns in the data set. More...

Index	get_input_columns_number () const
	Returns the number of columns whose uses are Input. More...

Index	get_input_time_series_columns_number () const

Index	get_target_columns_number () const
	Returns the number of columns whose uses are Target. More...

Index	get_target_time_series_columns_number () const

Index	get_time_columns_number () const
	Returns the number of columns whose uses are Time. More...

Index	get_unused_columns_number () const
	Returns the number of columns that are not used. More...

Index	get_used_columns_number () const
	Returns the number of columns that are used. More...

Index	get_column_index (const string &) const

Index	get_column_index (const Index &) const

Tensor< Index, 1 >	get_input_columns_indices () const
	Returns a vector of indices with the positions of the inputs. More...

Tensor< Index, 1 >	get_input_time_series_columns_indices () const

Tensor< Index, 1 >	get_target_columns_indices () const
	Returns a vector of indices with the positions of the targets. More...

Tensor< Index, 1 >	get_target_time_series_columns_indices () const

Tensor< Index, 1 >	get_unused_columns_indices () const
	Returns a vector of indices with the positions of the unused columns. More...

Tensor< Index, 1 >	get_used_columns_indices () const
	Returns a vector of indices with the positions of the used columns. More...

Tensor< string, 1 >	get_columns_names () const
	Returns a string vector that contains the names of the columns. More...

Tensor< string, 1 >	get_input_columns_names () const
	Returns a string vector that contains the names of the columns whose uses are Input. More...

Tensor< string, 1 >	get_target_columns_names () const
	Returns a string vector which contains the names of the columns whose uses are Target. More...

Tensor< string, 1 >	get_used_columns_names () const
	Returns a string vector which contains the names of the columns used whether Input, Target or Time. More...

ColumnType	get_column_type (const Index &index) const

VariableUse	get_column_use (const Index &) const
	Returns the use of the column, without taking into account the categories. More...

Tensor< VariableUse, 1 >	get_columns_uses () const
	Returns the uses of each column of the data set. More...

Index	get_variables_number () const
	Returns the number of variables in the data set. More...

Index	get_time_series_variables_number () const
	Returns the number of variables in the time series data. More...

Index	get_input_variables_number () const

Index	get_target_variables_number () const
	Returns the number of target variables of the data set. More...

Index	get_unused_variables_number () const
	Returns the number of variables which will neither be used as input nor as target. More...

Index	get_used_variables_number () const
	Returns the number of variables which are either input or target. More...

string	get_variable_name (const Index &) const

Tensor< string, 1 >	get_variables_names () const

Tensor< string, 1 >	get_time_series_variables_names () const

Tensor< string, 1 >	get_input_variables_names () const

Tensor< string, 1 >	get_target_variables_names () const

Index	get_variable_index (const string &name) const

Tensor< Index, 1 >	get_variable_indices (const Index &) const

Tensor< Index, 1 >	get_unused_variables_indices () const
	Returns the indices of the unused variables. More...

Tensor< Index, 1 >	get_used_variables_indices () const
	Returns the indices of the used variables. More...

Tensor< Index, 1 >	get_input_variables_indices () const
	Returns the indices of the input variables. More...

Tensor< Index, 1 >	get_target_variables_indices () const
	Returns the indices of the target variables. More...

VariableUse	get_variable_use (const Index &) const

Tensor< VariableUse, 1 >	get_variables_uses () const

const Tensor< Index, 1 > &	get_input_variables_dimensions () const
	Returns the dimensions of the input variables. More...

Index	get_input_variables_rank () const

Tensor< Scaler, 1 >	get_columns_scalers () const

Tensor< Scaler, 1 >	get_input_variables_scalers () const

Tensor< Scaler, 1 >	get_target_variables_scalers () const

Tensor< Index, 2 >	get_batches (const Tensor< Index, 1 > &, const Index &, const bool &, const Index &buffer_size=100) const

const Tensor< type, 2 > &	get_data () const

Tensor< type, 2 > *	get_data_pointer ()

const Tensor< type, 2 > &	get_time_series_data () const

Tensor< type, 2 >	get_training_data () const

Tensor< type, 2 >	get_selection_data () const

Tensor< type, 2 >	get_testing_data () const

Tensor< string, 1 >	get_time_series_columns_names () const

Index	get_time_series_columns_number () const
	Returns the number of columns in the time series. More...

Tensor< type, 2 >	get_input_data () const

Tensor< type, 2 >	get_target_data () const

Tensor< type, 2 >	get_input_data (const Tensor< Index, 1 > &) const

Tensor< type, 2 >	get_target_data (const Tensor< Index, 1 > &) const

Tensor< type, 2 >	get_training_input_data () const

Tensor< type, 2 >	get_training_target_data () const

Tensor< type, 2 >	get_selection_input_data () const

Tensor< type, 2 >	get_selection_target_data () const

Tensor< type, 2 >	get_testing_input_data () const

Tensor< type, 2 >	get_testing_target_data () const

Tensor< type, 1 >	get_sample_data (const Index &) const

Tensor< type, 1 >	get_sample_data (const Index &, const Tensor< Index, 1 > &) const

Tensor< type, 2 >	get_sample_input_data (const Index &) const

Tensor< type, 2 >	get_sample_target_data (const Index &) const

Tensor< type, 2 >	get_column_data (const Index &) const

Tensor< type, 2 >	get_column_data (const Index &, const Tensor< Index, 1 > &) const

Tensor< type, 2 >	get_column_data (const Tensor< Index, 1 > &) const

Tensor< type, 2 >	get_column_data (const string &) const

Tensor< type, 1 >	get_variable_data (const Index &) const

Tensor< type, 1 >	get_variable_data (const string &) const

Tensor< type, 1 >	get_variable_data (const Index &, const Tensor< Index, 1 > &) const

Tensor< type, 1 >	get_variable_data (const string &, const Tensor< Index, 1 > &) const

Tensor< Tensor< string, 1 >, 1 >	get_data_file_preview () const

Tensor< type, 2 >	get_subtensor_data (const Tensor< Index, 1 > &, const Tensor< Index, 1 > &) const

MissingValuesMethod	get_missing_values_method () const
	Returns the method used for missing values. More...

const string &	get_data_file_name () const
	Returns the name of the data file. More...

const bool &	get_header_line () const
	Returns true if the first line of the data file has a header with the names of the variables, and false otherwise. More...

const bool &	get_rows_label () const
	Returns true if the data file has rows label, and false otherwise. More...

Tensor< string, 1 >	get_rows_label_tensor () const

Tensor< string, 1 >	get_selection_rows_label_tensor ()

Tensor< string, 1 >	get_testing_rows_label_tensor ()

const Separator &	get_separator () const
	Returns the separator to be used in the data file. More...

char	get_separator_char () const
	Returns the character which will be used as separator in the data file. More...

string	get_separator_string () const
	Returns the string which will be used as separator in the data file. More...

const string &	get_missing_values_label () const
	Returns the string which will be used as label for the missing values in the data file. More...

const Index &	get_lags_number () const
	Returns the number of lags to be used in a time series prediction application. More...

const Index &	get_steps_ahead () const
	Returns the number of steps ahead to be used in a time series prediction application. More...

const string &	get_time_column () const
	Returns the indices of the time variables in the data set. More...

Index	get_time_series_time_column_index () const

Index	get_gmt () const

const bool &	get_display () const

void	set ()
	Sets zero samples and zero variables in the data set. More...

void	set (const Tensor< type, 2 > &)

void	set (const Index &, const Index &)

void	set (const Index &, const Index &, const Index &)

void	set (const DataSet &)

void	set (const tinyxml2::XMLDocument &)

void	set (const string &)

void	set (const string &, const char &, const bool &)

void	set_default ()

void	set_threads_number (const int &)

void	set_samples_number (const Index &)

void	set_training ()
	Sets all the samples in the data set for training. More...

void	set_selection ()
	Sets all the samples in the data set for selection. More...

void	set_testing ()
	Sets all the samples in the data set for testing. More...

void	set_training (const Tensor< Index, 1 > &)

void	set_selection (const Tensor< Index, 1 > &)

void	set_testing (const Tensor< Index, 1 > &)

void	set_samples_unused ()
	Sets all the samples in the data set for unused. More...

void	set_samples_unused (const Tensor< Index, 1 > &)

void	set_sample_use (const Index &, const SampleUse &)

void	set_sample_use (const Index &, const string &)

void	set_samples_uses (const Tensor< SampleUse, 1 > &)

void	set_samples_uses (const Tensor< string, 1 > &)

void	set_columns (const Tensor< Column, 1 > &)

void	set_default_columns_uses ()

void	set_default_columns_names ()

void	set_column_name (const Index &, const string &)

void	set_columns_uses (const Tensor< string, 1 > &)

void	set_columns_uses (const Tensor< VariableUse, 1 > &)

void	set_columns_unused ()
	Sets all columns in the data_set as unused columns. More...

void	set_input_target_columns (const Tensor< Index, 1 > &, const Tensor< Index, 1 > &)

void	set_input_columns_unused ()
	Sets all input columns in the data_set as unused columns. More...

void	set_input_columns (const Tensor< Index, 1 > &, const Tensor< bool, 1 > &)

void	set_column_use (const Index &, const VariableUse &)

void	set_column_use (const string &, const VariableUse &)

void	set_column_type (const Index &, const ColumnType &)

void	set_column_type (const string &, const ColumnType &)

void	set_columns_names (const Tensor< string, 1 > &)

void	set_columns_number (const Index &)

void	set_columns_scalers (const Scaler &)

void	set_binary_simple_columns ()

void	check_constant_columns ()

Tensor< type, 2 >	transform_binary_column (const Tensor< type, 1 > &) const

void	set_variables_names (const Tensor< string, 1 > &)

void	set_variable_name (const Index &, const string &)

void	set_input ()
	Sets all the variables in the data set as input variables. More...

void	set_target ()
	Sets all the variables in the data set as target variables. More...

void	set_variables_unused ()
	Sets all the variables in the data set as unused variables. More...

void	set_input_variables_dimensions (const Tensor< Index, 1 > &)
	Sets new input dimensions in the data set. More...

void	set_data (const Tensor< type, 2 > &)

void	set_data_file_name (const string &)

void	set_has_columns_names (const bool &)
	Sets if the data file contains a header with the names of the columns. More...

void	set_has_rows_label (const bool &)
	Sets if the data file contains rows label. More...

void	set_separator (const Separator &)

void	set_separator (const string &)

void	set_separator (const char &)

void	set_missing_values_label (const string &)

void	set_missing_values_method (const MissingValuesMethod &)

void	set_missing_values_method (const string &)

void	set_lags_number (const Index &)

void	set_steps_ahead_number (const Index &)

void	set_time_column (const string &)

void	set_gmt (Index &)

void	set_display (const bool &)

bool	is_empty () const
	Returns true if the data matrix is empty, and false otherwise. More...

bool	is_sample_used (const Index &) const

bool	is_sample_unused (const Index &) const

bool	has_binary_columns () const

bool	has_categorical_columns () const

bool	has_time_columns () const

bool	has_time_time_series_columns () const

bool	has_selection () const

void	split_samples_sequential (const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2))

void	split_samples_random (const type &training_ratio=static_cast< type >(0.6), const type &selection_ratio=static_cast< type >(0.2), const type &testing_ratio=static_cast< type >(0.2))

Tensor< string, 1 >	unuse_constant_columns ()

Tensor< Index, 1 >	unuse_repeated_samples ()

Tensor< string, 1 >	unuse_uncorrelated_columns (const type &=type(0.25))

void	set_data_constant (const type &)

void	set_data_random ()

void	set_data_binary_random ()

Tensor< Descriptives, 1 >	calculate_variables_descriptives () const

Tensor< Descriptives, 1 >	calculate_used_variables_descriptives () const

Tensor< Descriptives, 1 >	calculate_columns_descriptives_positive_samples () const
	Calculate the descriptives of the samples with positive targets in binary classification problems. More...

Tensor< Descriptives, 1 >	calculate_columns_descriptives_negative_samples () const
	Calculate the descriptives of the samples with negative targets in binary classification problems. More...

Tensor< Descriptives, 1 >	calculate_columns_descriptives_categories (const Index &) const

Tensor< Descriptives, 1 >	calculate_columns_descriptives_training_samples () const

Tensor< Descriptives, 1 >	calculate_columns_descriptives_selection_samples () const

Tensor< Descriptives, 1 >	calculate_input_variables_descriptives () const

Tensor< Descriptives, 1 >	calculate_target_variables_descriptives () const

Tensor< Descriptives, 1 >	calculate_testing_target_variables_descriptives () const

Tensor< type, 1 >	calculate_input_variables_minimums () const
	Returns a vector containing the minimums of the input variables. More...

Tensor< type, 1 >	calculate_target_variables_minimums () const
	Returns a vector containing the minimums of the target variables. More...

Tensor< type, 1 >	calculate_input_variables_maximums () const
	Returns a vector containing the maximums of the input variables. More...

Tensor< type, 1 >	calculate_target_variables_maximums () const
	Returns a vector containing the maximums of the target variables. More...

Tensor< type, 1 >	calculate_variables_means (const Tensor< Index, 1 > &) const

Tensor< type, 1 >	calculate_used_variables_minimums () const
	Returns a vector containing the minimums of the used variables. More...

Tensor< type, 1 >	calculate_used_targets_mean () const

Tensor< type, 1 >	calculate_selection_targets_mean () const
	Returns the mean values of the target variables on the selection. More...

Index	calculate_used_negatives (const Index &) const

Index	calculate_training_negatives (const Index &) const

Index	calculate_selection_negatives (const Index &) const

Index	calculate_testing_negatives (const Index &) const

Tensor< Histogram, 1 >	calculate_columns_distribution (const Index &=10) const

Tensor< BoxPlot, 1 >	calculate_columns_box_plots () const

Tensor< Correlation, 2 >	calculate_input_columns_correlations () const

void	print_inputs_correlations () const
	Print on screen the correlation between variables in the data set. More...

void	print_top_inputs_correlations () const

Tensor< Correlation, 2 >	calculate_input_target_columns_correlations () const

void	print_input_target_columns_correlations () const
	Print on screen the correlation between targets and inputs. More...

void	print_top_input_target_columns_correlations () const

Tensor< Index, 1 >	filter_data (const Tensor< type, 1 > &, const Tensor< type, 1 > &)

void	set_default_columns_scalers ()

Tensor< Descriptives, 1 >	scale_data ()

Tensor< Descriptives, 1 >	scale_input_variables ()

Tensor< Descriptives, 1 >	scale_target_variables ()

void	unscale_data (const Tensor< Descriptives, 1 > &)

void	unscale_input_variables (const Tensor< Descriptives, 1 > &)

void	unscale_target_variables (const Tensor< Descriptives, 1 > &)

Tensor< Index, 1 >	calculate_target_distribution () const

Tensor< Tensor< Index, 1 >, 1 >	calculate_Tukey_outliers (const type &=type(1.5)) const

void	unuse_Tukey_outliers (const type &=type(1.5))

Tensor< Index, 1 >	calculate_local_outlier_factor_outliers (const Index &=20, const Index &=0, const type &=type(0)) const

void	unuse_local_outlier_factor_outliers (const Index &=20, const type &=type(1.5))

Tensor< Index, 1 >	calculate_isolation_forest_outliers (const Index &=100, const Index &=256, const type &=type(0)) const

void	unuse_isolation_forest_outliers (const Index &=20, const type &=type(1.5))

void	transform_time_series ()
	Arranges an input-target DataSet from a time series matrix, according to the number of lags. More...

void	transform_time_series_columns ()
	This method transforms the columns into time series for forecasting problems. More...

void	transform_time_series_data ()

void	get_time_series_columns_number (const Index &)

void	set_time_series_data (const Tensor< type, 2 > &)

void	set_time_series_columns_number (const Index &)

Tensor< type, 2 >	get_time_series_column_data (const Index &) const

Tensor< type, 2 >	calculate_autocorrelations (const Index &=10) const

Tensor< type, 3 >	calculate_cross_correlations (const Index &=10) const
	Calculates the cross-correlation between all the variables in the data set. More...

void	generate_constant_data (const Index &, const Index &, const type &)

void	generate_random_data (const Index &, const Index &)

void	generate_sequential_data (const Index &, const Index &)

void	generate_Rosenbrock_data (const Index &, const Index &)

void	generate_sum_data (const Index &, const Index &)

void	print () const
	Prints to the screen in text format the main numbers from the data set object. More...

void	from_XML (const tinyxml2::XMLDocument &)

void	write_XML (tinyxml2::XMLPrinter &) const
	Serializes the data set object into an XML document of the TinyXML library without keeping the DOM tree in memory. More...

void	save (const string &) const

void	load (const string &)

void	print_columns () const

void	print_columns_types () const

void	print_columns_uses () const

void	print_data () const
	Prints to the screen the values of the data matrix. More...

void	print_data_preview () const

void	print_data_file_preview () const

void	save_data () const
	Saves to the data file the values of the data matrix. More...

void	save_data_binary (const string &) const
	Saves to the data file the values of the data matrix in binary format. More...

void	save_time_series_data_binary (const string &) const
	Saves to the data file the values of the time series data matrix in binary format. More...

void	read_csv ()

void	load_data_binary ()
	This method loads the data from a binary data file. More...

void	load_time_series_data_binary (const string &)
	This method loads time series data from a binary data. More...

void	check_input_csv (const string &, const char &) const
	This method checks if the input data file has the correct format. Returns an error message. More...

Tensor< type, 2 >	read_input_csv (const string &, const char &, const string &, const bool &, const bool &) const
	This method loads data from a file and returns a matrix containing the input columns. More...

void	fill_time_series (const Index &)

bool	has_nan () const
	Returns true if the data contain missing values. More...

bool	has_nan_row (const Index &) const
	Returns true if the given row contains missing values. More...

void	print_missing_values_information () const

void	impute_missing_values_unuse ()
	Sets all the samples with missing values to "Unused". More...

void	impute_missing_values_mean ()
	Substitutes all the missing values by the mean of the corresponding variable. More...

void	impute_missing_values_median ()
	Substitutes all the missing values by the median of the corresponding variable. More...

void	scrub_missing_values ()

Tensor< Index, 1 >	count_nan_columns () const

Index	count_rows_with_nan () const

Index	count_nan () const

void	set_missing_values_number (const Index &)

void	set_missing_values_number ()

void	set_columns_missing_values_number (const Tensor< Index, 1 > &)

void	set_columns_missing_values_number ()

void	set_rows_missing_values_number (const Index &)

void	set_rows_missing_values_number ()

void	fix_repeated_names ()

Tensor< Index, 1 >	push_back (const Tensor< Index, 1 > &, const Index &) const

Tensor< string, 1 >	push_back (const Tensor< string, 1 > &, const string &) const

void	initialize_sequential (Tensor< Index, 1 > &, const Index &, const Index &, const Index &) const

void	intialize_sequential (Tensor< type, 1 > &, const type &, const type &, const type &) const

Tensor< Index, 2 >	split_samples (const Tensor< Index, 1 > &, const Index &) const

bool	get_has_rows_labels () const

void	shuffle ()

void	read_csv_1 ()

void	read_csv_2_simple ()

void	read_csv_3_simple ()

void	read_csv_2_complete ()

void	read_csv_3_complete ()

void	check_separators (const string &) const

void	check_special_characters (const string &) const

Static Public Member Functions
static Tensor< string, 1 >	get_default_columns_names (const Index &)

static Scaler	get_scaling_unscaling_method (const string &)

Private Member Functions
Tensor< Index, 1 >	select_outliers_via_standard_deviation (const Tensor< type, 1 > &, const type &=type(2.0), bool=true) const

Tensor< Index, 1 >	select_outliers_via_contamination (const Tensor< type, 1 > &, const type &=type(0.05), bool=true) const

type	calculate_euclidean_distance (const Tensor< Index, 1 > &, const Index &, const Index &) const

Tensor< type, 2 >	calculate_distance_matrix (const Tensor< Index, 1 > &) const

Tensor< list< Index >, 1 >	calculate_k_nearest_neighbors (const Tensor< type, 2 > &, const Index &=20) const

Tensor< Tensor< type, 1 >, 1 >	get_kd_tree_data () const

Tensor< Tensor< Index, 1 >, 1 >	create_bounding_limits_kd_tree (const Index &) const

void	create_kd_tree (Tensor< Tensor< type, 1 >, 1 > &, const Tensor< Tensor< Index, 1 >, 1 > &) const

Tensor< list< Index >, 1 >	calculate_bounding_boxes_neighbors (const Tensor< Tensor< type, 1 >, 1 > &, const Tensor< Index, 1 > &, const Index &, const Index &) const

Tensor< list< Index >, 1 >	calculate_kd_tree_neighbors (const Index &=20, const Index &=40) const

Tensor< type, 1 >	calculate_average_reachability (Tensor< list< Index >, 1 > &, const Index &) const

Tensor< type, 1 >	calculate_local_outlier_factor (Tensor< list< Index >, 1 > &, const Tensor< type, 1 > &, const Index &) const

void	calculate_min_max_indices_list (list< Index > &, const Index &, type &, type &) const

Index	split_isolation_tree (Tensor< type, 2 > &, list< list< Index > > &, list< Index > &) const

Tensor< type, 2 >	create_isolation_tree (const Tensor< Index, 1 > &, const Index &) const

Tensor< Tensor< type, 2 >, 1 >	create_isolation_forest (const Index &, const Index &, const Index &) const

type	calculate_tree_path (const Tensor< type, 2 > &, const Index &, const Index &) const

Tensor< type, 1 >	calculate_average_forest_paths (const Tensor< Tensor< type, 2 >, 1 > &, const Index &) const

Private Attributes
NonBlockingThreadPool *	non_blocking_thread_pool = nullptr

ThreadPoolDevice *	thread_pool_device = nullptr

Tensor< type, 2 >	data

Tensor< SampleUse, 1 >	samples_uses

Tensor< string, 1 >	rows_labels

Tensor< Column, 1 >	columns

Tensor< Index, 1 >	input_variables_dimensions

string	data_file_name
	Data file name. More...

Separator	separator = Separator::Comma
	Separator character. More...

string	missing_values_label = "NA"
	Missing values label. More...

bool	has_columns_names = false
	Header which contains variables name. More...

bool	has_rows_labels = false
	Header which contains the rows label. More...

Tensor< Tensor< string, 1 >, 1 >	data_file_preview

string	time_column
	Index where time variable is located for forecasting applications. More...

Index	lags_number = 0
	Number of lags. More...

Index	steps_ahead = 0
	Number of steps ahead. More...

Tensor< type, 2 >	time_series_data

Tensor< Column, 1 >	time_series_columns

Index	gmt = 0

MissingValuesMethod	missing_values_method = MissingValuesMethod::Unuse
	Missing values method. More...

Index	missing_values_number
	Missing values. More...

Tensor< Index, 1 >	columns_missing_values_number

Index	rows_missing_values_number

bool	display = true
	Display messages to screen. More...

const Eigen::array< IndexPair< Index >, 1 >	product_vector_vector = {IndexPair<Index>(0, 0)}

Detailed Description

This class represents the concept of data set for data modelling problems, such as approximation, classification or forecasting.

It basically consists of a data Matrix separated by columns. These columns can take different categories depending on the data hosted in them.

With the OpenNN DataSet class you can edit the data to prepare your model, such as eliminating missing values, calculating correlations between variables (inputs and targets), not using certain variables or samples, etc.

Definition at line 56 of file data_set.h.

Member Enumeration Documentation

◆ ColumnType

enum class ColumnType

strong

This enumeration represents the data type of a column (numeric, binary, categorical, date-time or constant).

Definition at line 104 of file data_set.h.

◆ MissingValuesMethod

enum class MissingValuesMethod

strong

Enumeration of available methods for missing values in the data.

Definition at line 85 of file data_set.h.

◆ ProjectType

enum class ProjectType

strong

Enumeration of the learning tasks.

Definition at line 89 of file data_set.h.

◆ SampleUse

enum class SampleUse

strong

This enumeration represents the possible uses of a sample (training, selection, testing or unused).

Definition at line 94 of file data_set.h.

◆ Separator

enum class Separator

strong

Enumeration of available separators for the data file.

Definition at line 81 of file data_set.h.

◆ VariableUse

enum class VariableUse

strong

This enumeration represents the possible uses of a variable (id, input, target, time or unused).

Definition at line 99 of file data_set.h.

Constructor & Destructor Documentation

◆ DataSet() [1/5]

DataSet ( )

explicit

Default constructor. It creates a data set object with zero samples and zero inputs and target variables. It also initializes the rest of class members to their default values.

Definition at line 20 of file data_set.cpp.

◆ DataSet() [2/5]

DataSet ( const Tensor< type, 2 > & data )

explicit

Data constructor. It creates a data set object from a data Eigen tensor. It also initializes the rest of class members to their default values.

Parameters

data	Data Tensor<type, 2>.

Definition at line 32 of file data_set.cpp.

◆ DataSet() [3/5]

DataSet	(	const Index &	new_samples_number,
		const Index &	new_variables_number
	)

explicit

Samples and variables number constructor. It creates a data set object with given samples and variables numbers. All the variables are set as inputs. It also initializes the rest of class members to their default values.

Parameters

new_samples_number	Number of samples in the data set.
new_variables_number	Number of variables.

Definition at line 47 of file data_set.cpp.

◆ DataSet() [4/5]

DataSet	(	const Index &	new_samples_number,
		const Index &	new_inputs_number,
		const Index &	new_targets_number
	)

explicit

Samples number, input variables number and target variables number constructor. It creates a data set object with given samples and inputs and target variables numbers. It also initializes the rest of class members to their default values.

Parameters

new_samples_number	Number of samples in the data set.
new_inputs_number	Number of input variables.
new_targets_number	Number of target variables.

Definition at line 62 of file data_set.cpp.

◆ DataSet() [5/5]

DataSet	(	const string &	data_file_name,
		const char &	separator,
		const bool &	has_columns_names
	)

explicit

File and separator constructor. It creates a data set object by loading the object members from a data file. It also sets a separator. Please mind about the file format. This is specified in the User's Guide.

Parameters

data_file_name	Data file name.
separator	Separator character used in the data file.

Definition at line 76 of file data_set.cpp.

◆ ~DataSet()

~DataSet ( )

virtual

Destructor.

Definition at line 84 of file data_set.cpp.

Member Function Documentation

◆ calculate_autocorrelations()

Tensor< type, 2 > calculate_autocorrelations ( const Index & lags_number = 10 ) const

Returns a matrix with the values of autocorrelation for every variable in the data set. The number of rows is equal to the number of variables. The number of columns is the maximum lags number.

Parameters

lags_number Maximum lags number for which autocorrelation is calculated.

Definition at line 9315 of file data_set.cpp.

◆ calculate_average_forest_paths()

Tensor< type, 1 > calculate_average_forest_paths	(	const Tensor< Tensor< type, 2 >, 1 > &	forest,
		const Index &	tree_depth
	)		const

private

Definition at line 9269 of file data_set.cpp.

◆ calculate_average_reachability()

Tensor< type, 1 > calculate_average_reachability	(	Tensor< list< Index >, 1 > &	k_nearest_indexes,
		const Index &	k
	)		const

private

Definition at line 8935 of file data_set.cpp.

◆ calculate_bounding_boxes_neighbors()

Tensor< list< Index >, 1 > calculate_bounding_boxes_neighbors	(	const Tensor< Tensor< type, 1 >, 1 > &	tree,
		const Tensor< Index, 1 > &	leaves_indices,
		const Index &	depth,
		const Index &	k_neighbors
	)		const

private

Definition at line 8874 of file data_set.cpp.

◆ calculate_columns_box_plots()

Tensor< BoxPlot, 1 > calculate_columns_box_plots ( ) const

Returns a vector of subvectors with the values of a box and whiskers plot. The size of the vector is equal to the number of used variables. The size of the subvectors is 5 and they consist of:

Minimum
First quartile
Second quartile
Third quartile
Maximum

Definition at line 5320 of file data_set.cpp.

◆ calculate_columns_descriptives_categories()

Tensor< Descriptives, 1 > calculate_columns_descriptives_categories ( const Index & class_index ) const

Returns a matrix with the data set descriptive statistics.

Parameters

class_index Data set index number to make the descriptive statistics.

Definition at line 5645 of file data_set.cpp.

◆ calculate_columns_descriptives_negative_samples()

Tensor< Descriptives, 1 > calculate_columns_descriptives_negative_samples ( ) const

Calculate the descriptives of the samples with negative targets in binary classification problems.

Definition at line 5584 of file data_set.cpp.

◆ calculate_columns_descriptives_positive_samples()

Tensor< Descriptives, 1 > calculate_columns_descriptives_positive_samples ( ) const

Calculate the descriptives of the samples with positive targets in binary classification problems.

Definition at line 5525 of file data_set.cpp.

◆ calculate_columns_descriptives_selection_samples()

Tensor< Descriptives, 1 > calculate_columns_descriptives_selection_samples ( ) const

Returns a vector of vectors containing some basic descriptives of all variables on the selection samples. The size of this vector is four. The subvectors are:

Selection data minimum.
Selection data maximum.
Selection data mean.
Selection data standard deviation.

Definition at line 5712 of file data_set.cpp.

◆ calculate_columns_descriptives_training_samples()

Tensor< Descriptives, 1 > calculate_columns_descriptives_training_samples ( ) const

Returns a vector of vectors containing some basic descriptives of all variables on the training samples. The size of this vector is four. The subvectors are:

Training data minimum.
Training data maximum.
Training data mean.
Training data standard deviation.

Definition at line 5693 of file data_set.cpp.

◆ calculate_columns_distribution()

Tensor< Histogram, 1 > calculate_columns_distribution ( const Index & bins_number = 10 ) const

Returns the distribution of each of the columns. In the case of numeric columns, it returns a histogram, for the case of categorical columns, it returns the frequencies of each category and for the binary columns it returns the frequencies of the positives and negatives. The default number of bins is 10.

Parameters

bins_number Number of bins.

Definition at line 5201 of file data_set.cpp.

◆ calculate_cross_correlations()

Tensor< type, 3 > calculate_cross_correlations ( const Index & lags_number = 10 ) const

Calculates the cross-correlation between all the variables in the data set.

Definition at line 9424 of file data_set.cpp.

◆ calculate_distance_matrix()

Tensor< type, 2 > calculate_distance_matrix ( const Tensor< Index, 1 > & indices ) const

private

Definition at line 8727 of file data_set.cpp.

◆ calculate_euclidean_distance()

type calculate_euclidean_distance	(	const Tensor< Index, 1 > &	variables_indices,
		const Index &	sample_index,
		const Index &	other_sample_index
	)		const

private

Definition at line 8707 of file data_set.cpp.

◆ calculate_input_columns_correlations()

Tensor< Correlation, 2 > calculate_input_columns_correlations ( ) const

Calculate the correlation between each input in the data set. Returns a matrix with the correlation values between variables in the data set.

Definition at line 6019 of file data_set.cpp.

◆ calculate_input_target_columns_correlations()

Tensor< Correlation, 2 > calculate_input_target_columns_correlations ( ) const

Calculates the correlations between all outputs and all inputs. It returns a matrix with the data stored in CorrelationsResults format, where the number of rows is the input number and number of columns is the target number. Each element contains the correlation between a single input and a single target.

Definition at line 5876 of file data_set.cpp.

◆ calculate_input_variables_descriptives()

Tensor< Descriptives, 1 > calculate_input_variables_descriptives ( ) const

Returns a vector of Descriptives structures with some basic statistics of the input variables on the used samples. This includes the minimum, maximum, mean and standard deviation. The size of this vector is the number of inputs.

Definition at line 5726 of file data_set.cpp.

◆ calculate_input_variables_maximums()

Tensor< type, 1 > calculate_input_variables_maximums ( ) const

Returns a vector containing the maximums of the input variables.

Definition at line 5784 of file data_set.cpp.

◆ calculate_input_variables_minimums()

Tensor< type, 1 > calculate_input_variables_minimums ( ) const

Returns a vector containing the minimums of the input variables.

Definition at line 5767 of file data_set.cpp.

◆ calculate_isolation_forest_outliers()

Tensor< Index, 1 > calculate_isolation_forest_outliers	(	const Index &	n_trees = `100`,
		const Index &	subs_set_samples = `256`,
		const type &	contamination = `type(0)`
	)		const

Definition at line 9288 of file data_set.cpp.

◆ calculate_k_nearest_neighbors()

Tensor< list< Index >, 1 > calculate_k_nearest_neighbors	(	const Tensor< type, 2 > &	distance_matrix,
		const Index &	k_neighbors = `20`
	)		const

private

Definition at line 8751 of file data_set.cpp.

◆ calculate_kd_tree_neighbors()

Tensor< list< Index >, 1 > calculate_kd_tree_neighbors	(	const Index &	k_neighbors = `20`,
		const Index &	min_samples_leaf = `40`
	)		const

private

Definition at line 8915 of file data_set.cpp.

◆ calculate_local_outlier_factor()

Tensor< type, 1 > calculate_local_outlier_factor	(	Tensor< list< Index >, 1 > &	k_nearest_indexes,
		const Tensor< type, 1 > &	average_reachabilities,
		const Index &	k
	)		const

private

Definition at line 8971 of file data_set.cpp.

◆ calculate_local_outlier_factor_outliers()

Tensor< Index, 1 > calculate_local_outlier_factor_outliers	(	const Index &	k_neighbors = `20`,
		const Index &	min_samples_leaf = `0`,
		const type &	contamination = `type(0)`
	)		const

Calculate the outliers from the data set using the LocalOutlierFactor method.

Parameters

k_neighbors	Used to perform a k_nearest_algorithm to find the local density. Default is 20.
min_samples_leaf	The minimum number of samples per leaf when building a KDTree. If 0, automatically decide between using brute force approach or KDTree. If > samples_number/2, brute force approach is performed. Default is 0.
contamination	Percentage of outliers in the data_set to be selected. If 0.0, those patterns which deviate from the mean of LOF more than 2 times are considered outliers. Default is 0.0.

Definition at line 9002 of file data_set.cpp.

◆ calculate_min_max_indices_list()

void calculate_min_max_indices_list	(	list< Index > &	elements,
		const Index &	variable_index,
		type &	min,
		type &	max
	)		const

private

Definition at line 9069 of file data_set.cpp.

◆ calculate_selection_negatives()

Index calculate_selection_negatives ( const Index & target_index ) const

Counts the number of negatives of the selected target in the selection data.

Parameters

target_index Index of the target to evaluate.

Definition at line 5433 of file data_set.cpp.

◆ calculate_selection_targets_mean()

Tensor< type, 1 > calculate_selection_targets_mean ( ) const

Returns the mean values of the target variables on the selection.

Definition at line 5843 of file data_set.cpp.

◆ calculate_target_distribution()

Tensor< Index, 1 > calculate_target_distribution ( ) const

Returns a vector containing the number of samples of each class in the data set. If the number of target variables is one then the number of classes is two. If the number of target variables is greater than one then the number of classes is equal to the number of target variables.

Todo: The returned class_distribution is wrong.

Definition at line 8480 of file data_set.cpp.

◆ calculate_target_variables_descriptives()

Tensor< Descriptives, 1 > calculate_target_variables_descriptives ( ) const

Returns a vector of vectors with some basic descriptives of the target variables on all samples. The size of this vector is four. The subvectors are:

Target variables minimum.
Target variables maximum.
Target variables mean.
Target variables standard deviation.

Definition at line 5745 of file data_set.cpp.

◆ calculate_target_variables_maximums()

Tensor< type, 1 > calculate_target_variables_maximums ( ) const

Returns a vector containing the maximums of the target variables.

Definition at line 5792 of file data_set.cpp.

◆ calculate_target_variables_minimums()

Tensor< type, 1 > calculate_target_variables_minimums ( ) const

Returns a vector containing the minimums of the target variables.

Definition at line 5775 of file data_set.cpp.

◆ calculate_testing_negatives()

Index calculate_testing_negatives ( const Index & target_index ) const

Counts the number of negatives of the selected target in the testing data.

Parameters

target_index Index of the target to evaluate.

Definition at line 5468 of file data_set.cpp.

◆ calculate_testing_target_variables_descriptives()

Tensor< Descriptives, 1 > calculate_testing_target_variables_descriptives ( ) const

Definition at line 5755 of file data_set.cpp.

◆ calculate_training_negatives()

Index calculate_training_negatives ( const Index & target_index ) const

Counts the number of negatives of the selected target in the training data.

Parameters

target_index Index of the target to evaluate.

Definition at line 5398 of file data_set.cpp.

◆ calculate_tree_path()

type calculate_tree_path	(	const Tensor< type, 2 > &	tree,
		const Index &	sample_index,
		const Index &	tree_depth
	)		const

private

Definition at line 9227 of file data_set.cpp.

◆ calculate_Tukey_outliers()

Tensor< Tensor< Index, 1 >, 1 > calculate_Tukey_outliers ( const type & cleaning_parameter = type(1.5) ) const

Calculate the outliers from the data set using the Tukey's test.

Parameters

cleaning_parameter Parameter used to detect outliers.

Definition at line 8540 of file data_set.cpp.

◆ calculate_used_negatives()

Index calculate_used_negatives ( const Index & target_index ) const

Counts the number of used negatives of the selected target.

Parameters

target_index Index of the target to evaluate.

Definition at line 5363 of file data_set.cpp.

◆ calculate_used_targets_mean()

Tensor< type, 1 > calculate_used_targets_mean ( ) const

Definition at line 5831 of file data_set.cpp.

◆ calculate_used_variables_descriptives()

Tensor< Descriptives, 1 > calculate_used_variables_descriptives ( ) const

Returns a vector of vectors containing some basic descriptives of the used variables and samples. The size of this vector is four. The subvectors are:

Minimum.
Maximum.
Mean.
Standard deviation.

Definition at line 5514 of file data_set.cpp.

◆ calculate_used_variables_minimums()

Tensor< type, 1 > calculate_used_variables_minimums ( ) const

Returns a vector containing the minimums of the used variables.

Definition at line 5800 of file data_set.cpp.

◆ calculate_variables_descriptives()

Tensor< Descriptives, 1 > calculate_variables_descriptives ( ) const

Returns a vector of vectors containing some basic descriptives of all the variables in the data set. The size of this vector is four. The subvectors are:

Minimum.
Maximum.
Mean.
Standard deviation.

Definition at line 5499 of file data_set.cpp.

◆ calculate_variables_means()

Tensor< type, 1 > calculate_variables_means ( const Tensor< Index, 1 > & variables_indices ) const

Returns a vector containing the means of a set of given variables.

Parameters

variables_indices Indices of the variables.

Definition at line 5809 of file data_set.cpp.

◆ check_constant_columns()

void check_constant_columns ( )

Definition at line 3566 of file data_set.cpp.

◆ check_input_csv()

void check_input_csv	(	const string &	input_data_file_name,
		const char &	separator_char
	)		const

This method checks if the input data file has the correct format. Returns an error message.

Definition at line 8111 of file data_set.cpp.

◆ check_separators()

void check_separators ( const string & line ) const

Definition at line 10571 of file data_set.cpp.

◆ check_special_characters()

void check_special_characters ( const string & line ) const

Definition at line 10646 of file data_set.cpp.

◆ count_nan()

Index count_nan ( ) const

Definition at line 10783 of file data_set.cpp.

◆ count_nan_columns()

Tensor< Index, 1 > count_nan_columns ( ) const

Definition at line 10729 of file data_set.cpp.

◆ count_rows_with_nan()

Index count_rows_with_nan ( ) const

Definition at line 10754 of file data_set.cpp.

◆ create_bounding_limits_kd_tree()

Tensor< Tensor< Index, 1 >, 1 > create_bounding_limits_kd_tree ( const Index & depth ) const

private

Definition at line 8816 of file data_set.cpp.

◆ create_isolation_forest()

Tensor< Tensor< type, 2 >, 1 > create_isolation_forest	(	const Index &	trees_number,
		const Index &	sub_set_size,
		const Index &	max_depth
	)		const

private

Definition at line 9202 of file data_set.cpp.

◆ create_isolation_tree()

Tensor< type, 2 > create_isolation_tree	(	const Tensor< Index, 1 > &	indices,
		const Index &	max_depth
	)		const

private

Definition at line 9140 of file data_set.cpp.

◆ create_kd_tree()

void create_kd_tree	(	Tensor< Tensor< type, 1 >, 1 > &	tree,
		const Tensor< Tensor< Index, 1 >, 1 > &	bounding_limits
	)		const

private

Definition at line 8843 of file data_set.cpp.

◆ filter_data()

Tensor< Index, 1 > filter_data	(	const Tensor< type, 1 > &	minimums,
		const Tensor< type, 1 > &	maximums
	)

Unuses those samples with values outside a defined range.

Parameters

minimums	vector of minimum values in the range. The size must be equal to the number of variables.
maximums	vector of maximum values in the range. The size must be equal to the number of variables.

Todo:

Definition at line 9662 of file data_set.cpp.

◆ fix_repeated_names()

void fix_repeated_names ( )

Definition at line 10840 of file data_set.cpp.

◆ from_XML()

void from_XML ( const tinyxml2::XMLDocument & data_set_document )

Definition at line 6855 of file data_set.cpp.

◆ generate_constant_data()

void generate_constant_data	(	const Index &	samples_number,
		const Index &	variables_number,
		const type &	value
	)

Generates an artificial data_set with a given number of samples and number of variables by constant data.

Parameters

samples_number	Number of samples in the data_set.
variables_number	Number of variables in the data_set.

Definition at line 9558 of file data_set.cpp.

◆ generate_random_data()

void generate_random_data	(	const Index &	samples_number,
		const Index &	variables_number
	)

Generates an artificial data_set with a given number of samples and number of variables using random data.

Parameters

samples_number	Number of samples in the data_set.
variables_number	Number of variables in the data_set.

Todo:

Definition at line 9574 of file data_set.cpp.

◆ generate_Rosenbrock_data()

void generate_Rosenbrock_data	(	const Index &	samples_number,
		const Index &	variables_number
	)

Generates an artificial data_set with a given number of samples and number of variables using the Rosenbrock function.

Parameters

samples_number	Number of samples in the data_set.
variables_number	Number of variables in the data_set.

Todo:

Definition at line 9607 of file data_set.cpp.

◆ generate_sequential_data()

void generate_sequential_data	(	const Index &	samples_number,
		const Index &	variables_number
	)

Generates an artificial data_set with a given number of samples and number of variables using sequential data.

Parameters

samples_number	Number of samples in the data_set.
variables_number	Number of variables in the data_set.

Definition at line 9587 of file data_set.cpp.

◆ generate_sum_data()

void generate_sum_data	(	const Index &	samples_number,
		const Index &	variables_number
	)

Definition at line 9637 of file data_set.cpp.

◆ get_batches()

Tensor< Index, 2 > get_batches	(	const Tensor< Index, 1 > &	samples_indices,
		const Index &	batch_samples_number,
		const bool &	shuffle,
		const Index &	new_buffer_size = `100`
	)		const

Returns a vector, where each element is a vector that contains the indices of the different batches of the training samples.

Parameters

shuffle Is a boolean. If shuffle is true, then the indices are shuffled into batches, and false otherwise

Definition at line 1217 of file data_set.cpp.

◆ get_column_data() [1/3]

Tensor< type, 2 > get_column_data ( const Index & column_index ) const

Returns the data from the data set column with a given index, these data can be stored in a matrix or a vector depending on whether the column is categorical or not (respectively).

Parameters

column_index Index of the column.

Definition at line 4289 of file data_set.cpp.

◆ get_column_data() [2/3]

Tensor< type, 2 > get_column_data	(	const Index &	column_index,
		const Tensor< Index, 1 > &	rows_indices
	)		const

Returns the data from the data set column with a given index, these data can be stored in a matrix or a vector depending on whether the column is categorical or not (respectively).

Parameters

column_index	Index of the column.
rows_indices	Rows of the indices.

Definition at line 4332 of file data_set.cpp.

◆ get_column_data() [3/3]

Tensor< type, 2 > get_column_data ( const string & column_name ) const

Returns the data from the data set column with a given name, these data can be stored in a matrix or a vector depending on whether the column is categorical or not (respectively).

Parameters

column_name Name of the column.

Definition at line 4342 of file data_set.cpp.

◆ get_column_index() [1/2]

Index get_column_index ( const Index & variable_index ) const

Returns the index of the column to which a variable index belongs.

Parameters

variable_index Index of the variable to be found.

Definition at line 4214 of file data_set.cpp.

◆ get_column_index() [2/2]

Index get_column_index ( const string & column_name ) const

Returns the index of the column with the given name.

Parameters

column_name Name of the column to be found.

Definition at line 4192 of file data_set.cpp.

◆ get_column_type()

ColumnType get_column_type ( const Index & index ) const

inline

Definition at line 244 of file data_set.h.

◆ get_column_use()

DataSet::VariableUse get_column_use ( const Index & index ) const

Returns the use of the column with the given index, without taking into account the categories.

Definition at line 2000 of file data_set.cpp.

◆ get_columns()

Tensor< DataSet::Column, 1 > get_columns ( ) const

Returns the columns of the data set.

Definition at line 2697 of file data_set.cpp.

◆ get_columns_names()

Tensor< string, 1 > get_columns_names ( ) const

Returns a string vector that contains the names of the columns.

Definition at line 2473 of file data_set.cpp.

◆ get_columns_number()

Index get_columns_number ( ) const

Returns the number of columns in the data set.

Definition at line 2800 of file data_set.cpp.

◆ get_columns_scalers()

Tensor< Scaler, 1 > get_columns_scalers ( ) const

Definition at line 2408 of file data_set.cpp.

◆ get_columns_uses()

Tensor< DataSet::VariableUse, 1 > get_columns_uses ( ) const

Returns the uses of each column of the data set.

Definition at line 2008 of file data_set.cpp.

◆ get_data()

const Tensor< type, 2 > & get_data ( ) const

Returns a reference to the data matrix in the data set. The number of rows is equal to the number of samples. The number of columns is equal to the number of variables.

Definition at line 3673 of file data_set.cpp.

◆ get_data_file_name()

const string & get_data_file_name ( ) const

Returns the name of the data file.

Definition at line 3704 of file data_set.cpp.

◆ get_data_file_preview()

Tensor< Tensor< string, 1 >, 1 > get_data_file_preview ( ) const

Definition at line 4550 of file data_set.cpp.

◆ get_data_pointer()

Tensor< type, 2 > * get_data_pointer ( )

Definition at line 3679 of file data_set.cpp.

◆ get_default_columns_names()

Tensor< string, 1 > get_default_columns_names ( const Index & columns_number )

static

Definition at line 9873 of file data_set.cpp.

◆ get_display()

const bool & get_display ( ) const

Returns true if messages from this class can be displayed on the screen, or false if messages from this class can't be displayed on the screen.

Definition at line 94 of file data_set.cpp.

◆ get_gmt()

Index get_gmt ( ) const

Returns the value of the gmt that has the data set, by default it is 0. This is recommended to use in forecasting problems.

Definition at line 5856 of file data_set.cpp.

◆ get_has_rows_labels()

bool get_has_rows_labels ( ) const

Definition at line 11149 of file data_set.cpp.

◆ get_header_line()

const bool & get_header_line ( ) const

Returns true if the first line of the data file has a header with the names of the variables, and false otherwise.

Definition at line 3712 of file data_set.cpp.

◆ get_input_columns()

Tensor< DataSet::Column, 1 > get_input_columns ( ) const

Returns the input columns of the data set.

Definition at line 2717 of file data_set.cpp.

◆ get_input_columns_binary()

Tensor< bool, 1 > get_input_columns_binary ( ) const

Returns the input columns of the data set.

Definition at line 2739 of file data_set.cpp.

◆ get_input_columns_indices()

Tensor< Index, 1 > get_input_columns_indices ( ) const

Returns an indices vector with the positions of the inputs.

Definition at line 2271 of file data_set.cpp.

◆ get_input_columns_names()

Tensor< string, 1 > get_input_columns_names ( ) const

Returns a string vector that contains the names of the columns whose uses are Input.

Definition at line 2505 of file data_set.cpp.

◆ get_input_columns_number()

Index get_input_columns_number ( ) const

Returns the number of columns whose uses are Input.

Definition at line 2575 of file data_set.cpp.

◆ get_input_data() [1/2]

Tensor< type, 2 > get_input_data ( ) const

Returns a matrix with the input variables in the data set. The number of rows is the number of samples. The number of columns is the number of input variables.

Definition at line 3955 of file data_set.cpp.

◆ get_input_data() [2/2]

Tensor< type, 2 > get_input_data ( const Tensor< Index, 1 > & samples_indices ) const

Returns a tensor with the input variables in the data set. The number of rows is the number of samples. The number of columns is the number of input variables.

Definition at line 3986 of file data_set.cpp.

◆ get_input_time_series_columns_indices()

Tensor< Index, 1 > get_input_time_series_columns_indices ( ) const

Definition at line 2292 of file data_set.cpp.

◆ get_input_time_series_columns_number()

Index get_input_time_series_columns_number ( ) const

Definition at line 2591 of file data_set.cpp.

◆ get_input_variables_dimensions()

const Tensor< Index, 1 > & get_input_variables_dimensions ( ) const

Returns the dimensions of the input variables.

Definition at line 2245 of file data_set.cpp.

◆ get_input_variables_indices()

Tensor< Index, 1 > get_input_variables_indices ( ) const

Returns the indices of the input variables.

Definition at line 3047 of file data_set.cpp.

◆ get_input_variables_names()

Tensor< string, 1 > get_input_variables_names ( ) const

Returns the names of the input variables in the data set. The size of the vector is the number of input variables.

Definition at line 2184 of file data_set.cpp.

◆ get_input_variables_number()

Index get_input_variables_number ( ) const

Returns the number of input variables of the data set. Note that the number of variables does not have to equal the number of columns in the data set, because OpenNN recognizes the categorical columns, separating these categories into variables of the data set.

Definition at line 2859 of file data_set.cpp.

◆ get_input_variables_rank()

Index get_input_variables_rank ( ) const

Definition at line 2251 of file data_set.cpp.

◆ get_input_variables_scalers()

Tensor< Scaler, 1 > get_input_variables_scalers ( ) const

Definition at line 2423 of file data_set.cpp.

◆ get_kd_tree_data()

Tensor< Tensor< type, 1 >, 1 > get_kd_tree_data ( ) const

private

Definition at line 8792 of file data_set.cpp.

◆ get_lags_number()

const Index & get_lags_number ( ) const

Returns the number of lags to be used in a time series prediction application.

Definition at line 3825 of file data_set.cpp.

◆ get_missing_values_label()

const string & get_missing_values_label ( ) const

Returns the string which will be used as label for the missing values in the data file.

Definition at line 3817 of file data_set.cpp.

◆ get_missing_values_method()

DataSet::MissingValuesMethod get_missing_values_method ( ) const

Returns the method used for missing values.

Definition at line 3696 of file data_set.cpp.

◆ get_rows_label()

const bool & get_rows_label ( ) const

Returns true if the data file has rows label, and false otherwise.

Definition at line 3720 of file data_set.cpp.

◆ get_rows_label_tensor()

Tensor< string, 1 > get_rows_label_tensor ( ) const

Definition at line 3726 of file data_set.cpp.

◆ get_sample_data() [1/2]

Tensor< type, 1 > get_sample_data ( const Index & index ) const

Returns the inputs and target values of a single sample in the data set.

Parameters

index Index of the sample.

Definition at line 4093 of file data_set.cpp.

◆ get_sample_data() [2/2]

Tensor< type, 1 > get_sample_data	(	const Index &	sample_index,
		const Tensor< Index, 1 > &	variables_indices
	)		const

Returns the inputs and target values of a single sample in the data set.

Parameters

sample_index	Index of the sample.
variables_indices	Indices of the variables.

Definition at line 4123 of file data_set.cpp.

◆ get_sample_input_data()

Tensor< type, 2 > get_sample_input_data ( const Index & sample_index ) const

Returns the inputs values of a single sample in the data set.

Parameters

sample_index Index of the sample.

Definition at line 4163 of file data_set.cpp.

◆ get_sample_string()

string get_sample_string	(	const Index &	sample_index,
		const string &	separator = `","`
	)		const

Returns a string with the values of the sample corresponding to the given index. The values will be separated by the given separator char.

Parameters

sample_index	Index of the sample.
separator	Separator.

Definition at line 1006 of file data_set.cpp.

◆ get_sample_target_data()

Tensor< type, 2 > get_sample_target_data ( const Index & sample_index ) const

Returns the target values of a single sample in the data set.

Parameters

sample_index Index of the sample.

Definition at line 4181 of file data_set.cpp.

◆ get_sample_use()

DataSet::SampleUse get_sample_use ( const Index & index ) const

Returns the use of a single sample.

Parameters

index Sample index.

Definition at line 1199 of file data_set.cpp.

◆ get_samples_number()

Index get_samples_number ( ) const

inline

Definition at line 184 of file data_set.h.

◆ get_samples_uses()

const Tensor< DataSet::SampleUse, 1 > & get_samples_uses ( ) const

Returns the use of every sample (training, selection, testing or unused) in a vector.

Definition at line 1207 of file data_set.cpp.

◆ get_samples_uses_numbers()

Tensor< Index, 1 > get_samples_uses_numbers ( ) const

Returns a vector with the number of training, selection, testing and unused samples. The size of that vector is therefore four.

Definition at line 943 of file data_set.cpp.

◆ get_samples_uses_percentages()

Tensor< type, 1 > get_samples_uses_percentages ( ) const

Returns a vector with the uses of the samples in percentages of the data set. Uses: training, selection, testing and unused samples. Note that the vector size is four.

Definition at line 977 of file data_set.cpp.

◆ get_scaling_unscaling_method()

Scaler get_scaling_unscaling_method ( const string & scaling_unscaling_method )

static

Returns a value of the scaling-unscaling method enumeration from a string containing the name of that method.

Parameters

scaling_unscaling_method String with the name of the scaling and unscaling method.

Definition at line 3861 of file data_set.cpp.

◆ get_selection_data()

Tensor< type, 2 > get_selection_data ( ) const

Returns a matrix with the selection samples in the data set. The number of rows is the number of selection samples. The number of columns is the number of variables.

Definition at line 3921 of file data_set.cpp.

◆ get_selection_input_data()

Tensor< type, 2 > get_selection_input_data ( ) const

Returns a tensor with selection samples and input variables. The number of rows is the number of selection samples. The number of columns is the number of input variables.

Definition at line 4038 of file data_set.cpp.

◆ get_selection_rows_label_tensor()

Tensor< string, 1 > get_selection_rows_label_tensor ( )

Definition at line 3746 of file data_set.cpp.

◆ get_selection_samples_indices()

Tensor< Index, 1 > get_selection_samples_indices ( ) const

Returns the indices of the samples which will be used for selection.

Definition at line 1098 of file data_set.cpp.

◆ get_selection_samples_number()

Index get_selection_samples_number ( ) const

Returns the number of samples in the data set which will be used for selection.

Definition at line 1402 of file data_set.cpp.

◆ get_selection_target_data()

Tensor< type, 2 > get_selection_target_data ( ) const

Returns a tensor with selection samples and target variables. The number of rows is the number of selection samples. The number of columns is the number of target variables.

Definition at line 4052 of file data_set.cpp.

◆ get_separator()

const DataSet::Separator & get_separator ( ) const

Returns the separator to be used in the data file.

Definition at line 3763 of file data_set.cpp.

◆ get_separator_char()

char get_separator_char ( ) const

Returns the character which will be used as separator in the data file.

Definition at line 3771 of file data_set.cpp.

◆ get_separator_string()

string get_separator_string ( ) const

Returns the string which will be used as separator in the data file.

Definition at line 3794 of file data_set.cpp.

◆ get_steps_ahead()

const Index & get_steps_ahead ( ) const

Returns the number of steps ahead to be used in a time series prediction application.

Definition at line 3833 of file data_set.cpp.

◆ get_subtensor_data()

Tensor< type, 2 > get_subtensor_data	(	const Tensor< Index, 1 > &	rows_indices,
		const Tensor< Index, 1 > &	variables_indices
	)		const

Definition at line 4556 of file data_set.cpp.

◆ get_target_columns()

Tensor< DataSet::Column, 1 > get_target_columns ( ) const

Returns the target columns of the data set.

Definition at line 2759 of file data_set.cpp.

◆ get_target_columns_indices()

Tensor< Index, 1 > get_target_columns_indices ( ) const

Returns an indices vector with the positions of the targets.

Definition at line 2315 of file data_set.cpp.

◆ get_target_columns_names()

Tensor< string, 1 > get_target_columns_names ( ) const

Returns a string vector which contains the names of the columns whose uses are Target.

Definition at line 2528 of file data_set.cpp.

◆ get_target_columns_number()

Index get_target_columns_number ( ) const

Returns the number of columns whose uses are Target.

Definition at line 2609 of file data_set.cpp.

◆ get_target_data() [1/2]

Tensor< type, 2 > get_target_data ( ) const

Returns a matrix with the target variables in the data set. The number of rows is the number of samples. The number of columns is the number of target variables.

Definition at line 3972 of file data_set.cpp.

◆ get_target_data() [2/2]

Tensor< type, 2 > get_target_data ( const Tensor< Index, 1 > & samples_indices ) const

Returns a tensor with the target variables in the data set. The number of rows is the number of given samples. The number of columns is the number of target variables.

Definition at line 3998 of file data_set.cpp.

◆ get_target_time_series_columns_indices()

Tensor< Index, 1 > get_target_time_series_columns_indices ( ) const

Definition at line 2336 of file data_set.cpp.

◆ get_target_time_series_columns_number()

Index get_target_time_series_columns_number ( ) const

Definition at line 2625 of file data_set.cpp.

◆ get_target_variables_indices()

Tensor< Index, 1 > get_target_variables_indices ( ) const

Returns the indices of the target variables.

Definition at line 3094 of file data_set.cpp.

◆ get_target_variables_names()

Tensor< string, 1 > get_target_variables_names ( ) const

Returns the names of the target variables in the data set. The size of the vector is the number of target variables.

Definition at line 2215 of file data_set.cpp.

◆ get_target_variables_number()

Index get_target_variables_number ( ) const

Returns the number of target variables of the data set.

Definition at line 2884 of file data_set.cpp.

◆ get_target_variables_scalers()

Tensor< Scaler, 1 > get_target_variables_scalers ( ) const

Definition at line 2447 of file data_set.cpp.

◆ get_testing_data()

Tensor< type, 2 > get_testing_data ( ) const

Returns a matrix with the testing samples in the data set. The number of rows is the number of testing samples. The number of columns is the number of variables.

Definition at line 3938 of file data_set.cpp.

◆ get_testing_input_data()

Tensor< type, 2 > get_testing_input_data ( ) const

Returns a tensor with testing samples and input variables. The number of rows is the number of testing samples. The number of columns is the number of input variables.

Definition at line 4066 of file data_set.cpp.

◆ get_testing_rows_label_tensor()

Tensor< string, 1 > get_testing_rows_label_tensor ( )

Definition at line 3731 of file data_set.cpp.

◆ get_testing_samples_indices()

Tensor< Index, 1 > get_testing_samples_indices ( ) const

Returns the indices of the samples which will be used for testing.

Definition at line 1123 of file data_set.cpp.

◆ get_testing_samples_number()

Index get_testing_samples_number ( ) const

Returns the number of samples in the data set which will be used for testing.

Definition at line 1422 of file data_set.cpp.

◆ get_testing_target_data()

Tensor< type, 2 > get_testing_target_data ( ) const

Returns a tensor with testing samples and target variables. The number of rows is the number of testing samples. The number of columns is the number of target variables.

Definition at line 4080 of file data_set.cpp.

◆ get_time_column()

const string & get_time_column ( ) const

Returns the name of the time column in the data set.

Definition at line 3841 of file data_set.cpp.

◆ get_time_columns_number()

Index get_time_columns_number ( ) const

Returns the number of columns whose uses are Time.

Definition at line 2643 of file data_set.cpp.

◆ get_time_series_column_data()

Tensor< type, 2 > get_time_series_column_data ( const Index & column_index ) const

Returns the data from the time series column with a given index.

Parameters

column_index Index of the column.

Definition at line 4309 of file data_set.cpp.

◆ get_time_series_columns()

Tensor< DataSet::Column, 1 > get_time_series_columns ( ) const

Definition at line 2703 of file data_set.cpp.

◆ get_time_series_columns_names()

Tensor< string, 1 > get_time_series_columns_names ( ) const

Definition at line 2488 of file data_set.cpp.

◆ get_time_series_columns_number()

Index get_time_series_columns_number ( ) const

Returns the number of columns in the time series.

Definition at line 2807 of file data_set.cpp.

◆ get_time_series_data()

const Tensor< type, 2 > & get_time_series_data ( ) const

Returns a reference to the time series data matrix in the data set. Only for time series problems.

Definition at line 3688 of file data_set.cpp.

◆ get_time_series_data_rows_number()

Index get_time_series_data_rows_number ( ) const

Definition at line 2709 of file data_set.cpp.

◆ get_time_series_time_column_index()

Index get_time_series_time_column_index ( ) const

Definition at line 3847 of file data_set.cpp.

◆ get_time_series_variables_names()

Tensor< string, 1 > get_time_series_variables_names ( ) const

Returns a string vector with the names of all the variables in the time series data. The size of the vector is the number of variables.

Definition at line 2151 of file data_set.cpp.

◆ get_time_series_variables_number()

Index get_time_series_variables_number ( ) const

Returns the number of variables in the time series data.

Definition at line 2835 of file data_set.cpp.

◆ get_training_data()

Tensor< type, 2 > get_training_data ( ) const

Returns a matrix with the training samples in the data set. The number of rows is the number of training samples. The number of columns is the number of variables.

Definition at line 3900 of file data_set.cpp.

◆ get_training_input_data()

Tensor< type, 2 > get_training_input_data ( ) const

Returns a matrix with training samples and input variables. The number of rows is the number of training samples. The number of columns is the number of input variables.

Definition at line 4010 of file data_set.cpp.

◆ get_training_samples_indices()

Tensor< Index, 1 > get_training_samples_indices ( ) const

Returns the indices of the samples which will be used for training.

Definition at line 1073 of file data_set.cpp.

◆ get_training_samples_number()

Index get_training_samples_number ( ) const

Returns the number of samples in the data set which will be used for training.

Definition at line 1382 of file data_set.cpp.

◆ get_training_target_data()

Tensor< type, 2 > get_training_target_data ( ) const

Returns a tensor with training samples and target variables. The number of rows is the number of training samples. The number of columns is the number of target variables.

Definition at line 4024 of file data_set.cpp.

◆ get_unused_columns_indices()

Tensor< Index, 1 > get_unused_columns_indices ( ) const

Returns an indices vector with the positions of the unused columns.

Definition at line 2359 of file data_set.cpp.

◆ get_unused_columns_number()

Index get_unused_columns_number ( ) const

Returns the number of columns that are not used.

Definition at line 2661 of file data_set.cpp.

◆ get_unused_samples_indices()

Tensor< Index, 1 > get_unused_samples_indices ( ) const

Returns the indices of the samples set unused.

Definition at line 1173 of file data_set.cpp.

◆ get_unused_samples_number()

Index get_unused_samples_number ( ) const

Returns the number of samples in the data set which will neither be used for training, selection or testing.

Definition at line 1455 of file data_set.cpp.

◆ get_unused_variables_indices()

Tensor< Index, 1 > get_unused_variables_indices ( ) const

Returns the indices of the unused variables.

Definition at line 2956 of file data_set.cpp.

◆ get_unused_variables_number()

Index get_unused_variables_number ( ) const

Returns the number of variables which will neither be used as input nor as target.

Definition at line 2910 of file data_set.cpp.

◆ get_used_columns()

Tensor< DataSet::Column, 1 > get_used_columns ( ) const

Returns the used columns of the data set.

Definition at line 2781 of file data_set.cpp.

◆ get_used_columns_indices()

Tensor< Index, 1 > get_used_columns_indices ( ) const

Returns an indices vector with the positions of the used columns.

Definition at line 2383 of file data_set.cpp.

◆ get_used_columns_names()

Tensor< string, 1 > get_used_columns_names ( ) const

Returns a string vector which contains the names of the columns used whether Input, Target or Time.

Definition at line 2551 of file data_set.cpp.

◆ get_used_columns_number()

Index get_used_columns_number ( ) const

Returns the number of columns that are used.

Definition at line 2679 of file data_set.cpp.

◆ get_used_samples_indices()

Tensor< Index, 1 > get_used_samples_indices ( ) const

Returns the indices of the used samples (those which are not set unused).

Definition at line 1148 of file data_set.cpp.

◆ get_used_samples_number()

Index get_used_samples_number ( ) const

Returns the total number of training, selection and testing samples, i.e. those which are not "Unused".

Definition at line 1443 of file data_set.cpp.

◆ get_used_variables_indices()

Tensor< Index, 1 > get_used_variables_indices ( ) const

Returns the indices of the used variables.

Definition at line 3002 of file data_set.cpp.

◆ get_used_variables_number()

Index get_used_variables_number ( ) const

Returns the number of variables which are either input or target.

Definition at line 2259 of file data_set.cpp.

◆ get_variable_data() [1/4]

Tensor< type, 1 > get_variable_data ( const Index & index ) const

Returns all the samples of a single variable in the data set.

Parameters

index Index of the variable.

Definition at line 4353 of file data_set.cpp.

◆ get_variable_data() [2/4]

Tensor< type, 1 > get_variable_data	(	const Index &	variable_index,
		const Tensor< Index, 1 > &	samples_indices
	)		const

Returns a given set of samples of a single variable in the data set.

Parameters

variable_index	Index of the variable.
samples_indices	Indices of the samples.

Definition at line 4442 of file data_set.cpp.

◆ get_variable_data() [3/4]

Tensor< type, 1 > get_variable_data ( const string & variable_name ) const

Returns all the samples of a single variable in the data set.

Parameters

variable_name Name of the variable.

Definition at line 4380 of file data_set.cpp.

◆ get_variable_data() [4/4]

Tensor< type, 1 > get_variable_data	(	const string &	variable_name,
		const Tensor< Index, 1 > &	samples_indices
	)		const

Returns a given set of samples of a single variable in the data set.

Parameters

variable_name	Name of the variable.
samples_indices	Indices of the samples.

Definition at line 4481 of file data_set.cpp.

◆ get_variable_index()

Index get_variable_index ( const string & name ) const

Returns a variable index in the data set with given name.

Parameters

name	Name of variable.

Definition at line 2937 of file data_set.cpp.

◆ get_variable_indices()

Tensor< Index, 1 > get_variable_indices ( const Index & column_index ) const

Returns the indices of a variable in the data set. Note that the number of variables does not have to equal the number of columns in the data set, because OpenNN recognizes the categorical columns, separating these categories into variables of the data set.

Definition at line 4248 of file data_set.cpp.

◆ get_variable_name()

string get_variable_name ( const Index & variable_index ) const

Returns the name of a single variable in the data set.

Parameters

variable_index Index of variable.

Definition at line 2059 of file data_set.cpp.

◆ get_variable_use()

DataSet::VariableUse get_variable_use ( const Index & index ) const

Returns the use of a single variable.

Parameters

index Index of variable.

Definition at line 1992 of file data_set.cpp.

◆ get_variables_names()

Tensor< string, 1 > get_variables_names ( ) const

Returns a string vector with the names of all the variables in the data set. The size of the vector is the number of variables.

Definition at line 2118 of file data_set.cpp.

◆ get_variables_number()

Index get_variables_number ( ) const

Returns the number of variables in the data set.

Definition at line 2814 of file data_set.cpp.

◆ get_variables_uses()

Tensor< DataSet::VariableUse, 1 > get_variables_uses ( ) const

Returns a vector containing the use of each column, including the categories. The size of the vector is equal to the number of variables.

Definition at line 2026 of file data_set.cpp.

◆ has_binary_columns()

bool has_binary_columns ( ) const

Definition at line 10668 of file data_set.cpp.

◆ has_categorical_columns()

bool has_categorical_columns ( ) const

Definition at line 10681 of file data_set.cpp.

◆ has_nan()

bool has_nan ( ) const

Returns true if the data contain missing values.

Definition at line 5912 of file data_set.cpp.

◆ has_nan_row()

bool has_nan_row ( const Index & row_index ) const

Returns true if the given row contains missing values.

Definition at line 5922 of file data_set.cpp.

◆ has_selection()

bool has_selection ( ) const

Definition at line 10721 of file data_set.cpp.

◆ has_time_columns()

bool has_time_columns ( ) const

Definition at line 10694 of file data_set.cpp.

◆ has_time_time_series_columns()

bool has_time_time_series_columns ( ) const

Definition at line 10707 of file data_set.cpp.

◆ impute_missing_values_mean()

void impute_missing_values_mean ( )

Substitutes all the missing values by the mean of the corresponding variable.

Definition at line 9768 of file data_set.cpp.

◆ impute_missing_values_median()

void impute_missing_values_median ( )

Substitutes all the missing values by the median of the corresponding variable.

Definition at line 9802 of file data_set.cpp.

◆ impute_missing_values_unuse()

void impute_missing_values_unuse ( )

Sets all the samples with missing values to "Unused".

Definition at line 9753 of file data_set.cpp.

◆ initialize_sequential()

void initialize_sequential	(	Tensor< Index, 1 > &	new_tensor,
		const Index &	start,
		const Index &	step,
		const Index &	end
	)		const

Definition at line 10947 of file data_set.cpp.

◆ intialize_sequential()

void intialize_sequential	(	Tensor< type, 1 > &	new_tensor,
		const type &	start,
		const type &	step,
		const type &	end
	)		const

Definition at line 10964 of file data_set.cpp.

◆ is_empty()

bool is_empty ( ) const

Returns true if the data matrix is empty, and false otherwise.

Definition at line 3658 of file data_set.cpp.

◆ is_sample_unused()

bool is_sample_unused ( const Index & index ) const

Returns true if a given sample is to be unused, and false otherwise.

Parameters

index Sample index.

Definition at line 926 of file data_set.cpp.

◆ is_sample_used()

bool is_sample_used ( const Index & index ) const

Returns true if a given sample is to be used for training, selection or testing, and false if it is to be unused.

Parameters

index Sample index.

Definition at line 910 of file data_set.cpp.

◆ load()

void load ( const string & file_name )

Loads the members of a data set object from a XML-type file:

Samples number.
Training samples number.
Training samples indices.
Selection samples number.
Selection samples indices.
Testing samples number.
Testing samples indices.
Input variables number.
Input variables indices.
Target variables number.
Target variables indices.
Input variables name.
Target variables name.
Input variables description.
Target variables description.
Display.
Data.

Please mind the file format. This is specified in the User's Guide.

Parameters

file_name Name of data set XML-type file.

Definition at line 7716 of file data_set.cpp.

◆ load_data_binary()

void load_data_binary ( )

This method loads the data from a binary data file.

Definition at line 8023 of file data_set.cpp.

◆ load_time_series_data_binary()

void load_time_series_data_binary ( const string & time_series_data_file_name )

This method loads time series data from a binary data.

Definition at line 8067 of file data_set.cpp.

◆ print()

void print ( ) const

Prints to the screen in text format the main numbers from the data set object.

Definition at line 7664 of file data_set.cpp.

◆ print_columns()

void print_columns ( ) const

Definition at line 7735 of file data_set.cpp.

◆ print_columns_types()

void print_columns_types ( ) const

Definition at line 7749 of file data_set.cpp.

◆ print_columns_uses()

void print_columns_uses ( ) const

Definition at line 7766 of file data_set.cpp.

◆ print_data()

void print_data ( ) const

Prints to the screen the values of the data matrix.

Definition at line 7783 of file data_set.cpp.

◆ print_data_file_preview()

void print_data_file_preview ( ) const

Definition at line 6072 of file data_set.cpp.

◆ print_data_preview()

void print_data_preview ( ) const

Prints to the screen a preview of the data matrix, i.e., the first, second and last samples.

Definition at line 7792 of file data_set.cpp.

◆ print_input_target_columns_correlations()

void print_input_target_columns_correlations ( ) const

Print on screen the correlation between targets and inputs.

Definition at line 5960 of file data_set.cpp.

◆ print_inputs_correlations()

void print_inputs_correlations ( ) const

Print on screen the correlation between variables in the data set.

Definition at line 6064 of file data_set.cpp.

◆ print_missing_values_information()

void print_missing_values_information ( ) const

Print on screen the information about the missing values in the data set.

Total number of missing values.
Number of variables with missing values.
Number of samples with missing values.

Definition at line 5940 of file data_set.cpp.

◆ print_top_input_target_columns_correlations()

void print_top_input_target_columns_correlations ( ) const

This method prints on screen the correlation between inputs and targets.

Parameters

number Number of variables to be printed.

Definition at line 5983 of file data_set.cpp.

◆ print_top_inputs_correlations()

void print_top_inputs_correlations ( ) const

This method prints on screen the correlation between variables.

Parameters

number Number of variables to be printed.

Definition at line 6091 of file data_set.cpp.

◆ push_back() [1/2]

Tensor< Index, 1 > push_back	(	const Tensor< Index, 1 > &	old_vector,
		const Index &	new_string
	)		const

Definition at line 10915 of file data_set.cpp.

◆ push_back() [2/2]

Tensor< string, 1 > push_back	(	const Tensor< string, 1 > &	old_vector,
		const string &	new_string
	)		const

Definition at line 10931 of file data_set.cpp.

◆ read_csv()

void read_csv ( )

Definition at line 9853 of file data_set.cpp.

◆ read_csv_1()

void read_csv_1 ( )

Definition at line 9890 of file data_set.cpp.

◆ read_csv_2_complete()

void read_csv_2_complete ( )

Definition at line 10253 of file data_set.cpp.

◆ read_csv_2_simple()

void read_csv_2_simple ( )

Definition at line 10050 of file data_set.cpp.

◆ read_csv_3_complete()

void read_csv_3_complete ( )

Definition at line 10395 of file data_set.cpp.

◆ read_csv_3_simple()

void read_csv_3_simple ( )

Definition at line 10140 of file data_set.cpp.

◆ read_input_csv()

Tensor< type, 2 > read_input_csv	(	const string &	input_data_file_name,
		const char &	separator_char,
		const string &	missing_values_label,
		const bool &	has_columns_name,
		const bool &	has_rows_label
	)		const

This method loads data from a file and returns a matrix containing the input columns.

Definition at line 8182 of file data_set.cpp.

◆ save()

void save ( const string & file_name ) const

Saves the members of a data set object to a XML-type file in an XML-type format.

Parameters

file_name Name of data set XML-type file.

Definition at line 7681 of file data_set.cpp.

◆ save_data()

void save_data ( ) const

Saves to the data file the values of the data matrix.

Definition at line 7844 of file data_set.cpp.

◆ save_data_binary()

void save_data_binary ( const string & binary_data_file_name ) const

Saves to the data file the values of the data matrix in binary format.

Definition at line 7909 of file data_set.cpp.

◆ save_time_series_data_binary()

void save_time_series_data_binary ( const string & binary_data_file_name ) const

Saves to the data file the values of the time series data matrix in binary format.

Definition at line 7958 of file data_set.cpp.

◆ scale_data()

Tensor< Descriptives, 1 > scale_data ( )

Definition at line 6146 of file data_set.cpp.

◆ scale_input_variables()

Tensor< Descriptives, 1 > scale_input_variables ( )

It scales every input variable with the given method. The method to be used is that in the scaling and unscaling method variable.

Definition at line 6243 of file data_set.cpp.

◆ scale_target_variables()

Tensor< Descriptives, 1 > scale_target_variables ( )

Calculates the input and target variables descriptives. Then it scales the target variables with those values. The method to be used is that in the scaling and unscaling method variable. Finally, it returns the descriptives.

Definition at line 6298 of file data_set.cpp.

◆ scrub_missing_values()

void scrub_missing_values ( )

General method for dealing with missing values. It switches among the different scrubbing methods available, according to the corresponding value in the missing values object.

Definition at line 9828 of file data_set.cpp.

◆ select_outliers_via_contamination()

Tensor< Index, 1 > select_outliers_via_contamination	(	const Tensor< type, 1 > &	outlier_ranks,
		const type &	contamination = `type(0.05)`,
		bool	higher = `true`
	)		const

private

Definition at line 8634 of file data_set.cpp.

◆ select_outliers_via_standard_deviation()

Tensor< Index, 1 > select_outliers_via_standard_deviation	(	const Tensor< type, 1 > &	outlier_ranks,
		const type &	deviation_factor = `type(2.0)`,
		bool	higher = `true`
	)		const

private

Definition at line 8673 of file data_set.cpp.

◆ set() [1/8]

void set ( )

Sets zero samples and zero variables in the data set.

Definition at line 4586 of file data_set.cpp.

◆ set() [2/8]

void set ( const DataSet & other_data_set )

Sets the members of this data set object with those from another data set object.

Parameters

other_data_set Data set object to be copied.

Definition at line 4743 of file data_set.cpp.

◆ set() [3/8]

void set	(	const Index &	new_samples_number,
		const Index &	new_variables_number
	)

Sets new numbers of samples and variables in the inputs targets data set. All the samples are set for training. All the variables are set as inputs.

Parameters

new_samples_number	Number of samples.
new_variables_number	Number of variables.

Definition at line 4649 of file data_set.cpp.

◆ set() [4/8]

void set	(	const Index &	new_samples_number,
		const Index &	new_inputs_number,
		const Index &	new_targets_number
	)

Sets new numbers of samples and inputs and target variables in the data set. The variables in the data set are the number of inputs plus the number of targets.

Parameters

new_samples_number	Number of samples.
new_inputs_number	Number of input variables.
new_targets_number	Number of target variables.

Definition at line 4703 of file data_set.cpp.

◆ set() [5/8]

void set ( const string & file_name )

Sets the data set members by loading them from a XML file.

Parameters

file_name Data set XML file_name.

Definition at line 4775 of file data_set.cpp.

◆ set() [6/8]

void set	(	const string &	data_file_name,
		const char &	separator,
		const bool &	new_has_columns_names
	)

Definition at line 4606 of file data_set.cpp.

◆ set() [7/8]

void set ( const Tensor< type, 2 > & new_data )

Sets all variables from a data matrix.

Parameters

new_data Data matrix.

Definition at line 4628 of file data_set.cpp.

◆ set() [8/8]

void set ( const tinyxml2::XMLDocument & data_set_document )

Sets the data set members from a XML document.

Parameters

data_set_document TinyXML document containing the member data.

Definition at line 4764 of file data_set.cpp.

◆ set_binary_simple_columns()

void set_binary_simple_columns ( )

Definition at line 3483 of file data_set.cpp.

◆ set_column_name()

void set_column_name	(	const Index &	column_index,
		const string &	new_name
	)

Sets the name of a single column.

Parameters

column_index	Index of column.
new_name	New name for that column.

Definition at line 1983 of file data_set.cpp.

◆ set_column_type() [1/2]

void set_column_type	(	const Index &	index,
		const ColumnType &	new_type
	)

Definition at line 3277 of file data_set.cpp.

◆ set_column_type() [2/2]

void set_column_type	(	const string &	name,
		const ColumnType &	new_type
	)

Definition at line 3283 of file data_set.cpp.

◆ set_column_use() [1/2]

void set_column_use	(	const Index &	index,
		const VariableUse &	new_use
	)

Sets the use of a single column.

Parameters

index	Index of column.
new_use	Use for that column.

Definition at line 3255 of file data_set.cpp.

◆ set_column_use() [2/2]

void set_column_use	(	const string &	name,
		const VariableUse &	new_use
	)

Sets the use of a single column.

Parameters

name	Name of column.
new_use	Use for that column.

Definition at line 3270 of file data_set.cpp.

◆ set_columns()

void set_columns ( const Tensor< Column, 1 > & new_columns )

Definition at line 1912 of file data_set.cpp.

◆ set_columns_missing_values_number() [1/2]

void set_columns_missing_values_number ( )

Definition at line 10822 of file data_set.cpp.

◆ set_columns_missing_values_number() [2/2]

void set_columns_missing_values_number ( const Tensor< Index, 1 > & new_columns_missing_values_number )

Definition at line 10816 of file data_set.cpp.

◆ set_columns_names()

void set_columns_names ( const Tensor< string, 1 > & new_names )

Sets new names for the columns in the data set from a vector of strings. The size of that vector must be equal to the total number of variables.

Parameters

new_names Name of variables.

Definition at line 3403 of file data_set.cpp.

◆ set_columns_number()

void set_columns_number ( const Index & new_columns_number )

Sets a new number of variables in the variables object. All variables are set as inputs but the last one, which is set as targets.

Parameters

new_columns_number Number of variables.

Definition at line 3465 of file data_set.cpp.

◆ set_columns_scalers()

void set_columns_scalers ( const Scaler & scalers )

Definition at line 3473 of file data_set.cpp.

◆ set_columns_unused()

void set_columns_unused ( )

Sets all columns in the data_set as unused columns.

Definition at line 3200 of file data_set.cpp.

◆ set_columns_uses() [1/2]

void set_columns_uses ( const Tensor< string, 1 > & new_columns_uses )

Sets the uses of the data set columns.

Parameters

new_columns_uses String vector that contains the new uses to be set, note that this vector needs to be the size of the number of columns in the data set.

Definition at line 3142 of file data_set.cpp.

◆ set_columns_uses() [2/2]

void set_columns_uses ( const Tensor< VariableUse, 1 > & new_columns_uses )

Sets the uses of the data set columns.

Parameters

new_columns_uses DataSet::VariableUse vector that contains the new uses to be set, note that this vector needs to be the size of the number of columns in the data set.

Definition at line 3173 of file data_set.cpp.

◆ set_data()

void set_data ( const Tensor< type, 2 > & new_data )

Sets a new data matrix. The number of rows must be equal to the number of samples. The number of columns must be equal to the number of variables. Indices of all training, selection and testing samples and inputs and target variables do not change.

Parameters

new_data Data matrix.

Definition at line 4831 of file data_set.cpp.

◆ set_data_binary_random()

void set_data_binary_random ( )

Initializes the data matrix with random values chosen from a uniform distribution with given minimum and maximum. The targets will be binary randoms.

Definition at line 6458 of file data_set.cpp.

◆ set_data_constant()

void set_data_constant ( const type & new_value )

Initializes the data matrix with a given value.

Parameters

new_value Initialization value.

Definition at line 6440 of file data_set.cpp.

◆ set_data_file_name()

void set_data_file_name ( const string & new_data_file_name )

Sets the name of the data file. It also loads the data from that file. Moreover, it sets the variables and samples objects.

Parameters

new_data_file_name Name of the file containing the data.

Definition at line 4858 of file data_set.cpp.

◆ set_data_random()

void set_data_random ( )

Initializes the data matrix with random values chosen from a uniform distribution with given minimum and maximum.

Definition at line 6449 of file data_set.cpp.

◆ set_default()

void set_default ( )

Sets the default member values:

Display: True.

Definition at line 4796 of file data_set.cpp.

◆ set_default_columns_names()

void set_default_columns_names ( )

This method sets the names of the columns in the data set. It is used when the data set does not have a header; the default names are: column_0, column_1, ..., column_n.

Definition at line 1968 of file data_set.cpp.

◆ set_default_columns_scalers()

void set_default_columns_scalers ( )

Sets, for each of the input variables, the scaling method that best fits it. Note that this method returns void; it does not return a vector of strings.

Todo: Takes too long with big files.

Definition at line 6128 of file data_set.cpp.

◆ set_default_columns_uses()

void set_default_columns_uses ( )

This method sets the default uses of the n columns of the data set, i.e. columns 1 to n-1 are set as Input and column n is set as Target.

Definition at line 1921 of file data_set.cpp.

◆ set_display()

void set_display ( const bool & new_display )

Sets a new display value. If it is set to true messages from this class are to be displayed on the screen; if it is set to false messages from this class are not to be displayed on the screen.

Parameters

new_display Display value.

Definition at line 4785 of file data_set.cpp.

◆ set_gmt()

void set_gmt ( Index & new_gmt )

Sets the value of gmt, which is 0 by default. Its use is recommended in forecasting problems.

Definition at line 5865 of file data_set.cpp.

◆ set_has_columns_names()

void set_has_columns_names ( const bool & new_has_columns_names )

Sets whether the data file contains a header with the names of the columns.

Definition at line 4866 of file data_set.cpp.

◆ set_has_rows_label()

void set_has_rows_label ( const bool & new_has_rows_label )

Sets whether the data file contains row labels.

Definition at line 4874 of file data_set.cpp.

◆ set_input()

void set_input ( )

Sets all the variables in the data set as input variables.

Definition at line 3428 of file data_set.cpp.

◆ set_input_columns()

void set_input_columns	(	const Tensor< Index, 1 > &	input_columns_indices,
		const Tensor< bool, 1 > &	input_columns_use
	)

Definition at line 3241 of file data_set.cpp.

◆ set_input_columns_unused()

void set_input_columns_unused ( )

Sets all input columns in the data_set as unused columns.

Definition at line 3229 of file data_set.cpp.

◆ set_input_target_columns()

void set_input_target_columns	(	const Tensor< Index, 1 > &	input_columns,
		const Tensor< Index, 1 > &	target_columns
	)

Definition at line 3211 of file data_set.cpp.

◆ set_input_variables_dimensions()

void set_input_variables_dimensions ( const Tensor< Index, 1 > & new_inputs_dimensions )

Sets new input dimensions in the data set.

Definition at line 3650 of file data_set.cpp.

◆ set_lags_number()

void set_lags_number ( const Index & new_lags_number )

Sets a new number of lags to be defined for a time series prediction application. When loading the data file, the time series data will be modified according to this number.

Parameters

new_lags_number Number of lags (x-1, ..., x-l) to be used.

Definition at line 5022 of file data_set.cpp.

◆ set_missing_values_label()

void set_missing_values_label ( const string & new_missing_values_label )

Sets a new label for the missing values.

Parameters

new_missing_values_label Label for the missing values.

Definition at line 4961 of file data_set.cpp.

◆ set_missing_values_method() [1/2]

void set_missing_values_method ( const MissingValuesMethod & new_missing_values_method )

Sets a new method for the missing values.

Parameters

new_missing_values_method Method for the missing values.

Definition at line 4985 of file data_set.cpp.

◆ set_missing_values_method() [2/2]

void set_missing_values_method ( const string & new_missing_values_method )

Definition at line 4991 of file data_set.cpp.

◆ set_missing_values_number() [1/2]

void set_missing_values_number ( )

Definition at line 10810 of file data_set.cpp.

◆ set_missing_values_number() [2/2]

void set_missing_values_number ( const Index & new_missing_values_number )

Definition at line 10804 of file data_set.cpp.

◆ set_rows_missing_values_number() [1/2]

void set_rows_missing_values_number ( )

Definition at line 10834 of file data_set.cpp.

◆ set_rows_missing_values_number() [2/2]

void set_rows_missing_values_number ( const Index & new_rows_missing_values_number )

Definition at line 10828 of file data_set.cpp.

◆ set_sample_use() [1/2]

void set_sample_use	(	const Index &	index,
		const SampleUse &	new_use
	)

Sets the use of a single sample.

Parameters

index	Index of sample.
new_use	Use for that sample.

Definition at line 1591 of file data_set.cpp.

◆ set_sample_use() [2/2]

void set_sample_use	(	const Index &	index,
		const string &	new_use
	)

Sets the use of a single sample from a string.

Parameters

index	Index of sample.
new_use	String with the use name ("Training", "Selection", "Testing" or "Unused").

Definition at line 1602 of file data_set.cpp.

◆ set_samples_number()

void set_samples_number ( const Index & new_samples_number )

Sets a new number of samples in the data set. All samples are also set for training. The indices of the inputs and target variables do not change.

Parameters

new_samples_number Number of samples.

Definition at line 5062 of file data_set.cpp.

◆ set_samples_unused() [1/2]

void set_samples_unused ( )

Sets all the samples in the data set as unused.

Definition at line 1562 of file data_set.cpp.

◆ set_samples_unused() [2/2]

void set_samples_unused ( const Tensor< Index, 1 > & indices )

Sets the samples with the given indices in the data set as unused.

Parameters

indices Vector with the indices of the samples in the data set to be set as unused.

Definition at line 1576 of file data_set.cpp.

◆ set_samples_uses() [1/2]

void set_samples_uses ( const Tensor< SampleUse, 1 > & new_uses )

Sets new uses to all the samples from a single vector.

Parameters

new_uses vector of use structures. The size of given vector must be equal to the number of samples.

Definition at line 1637 of file data_set.cpp.

◆ set_samples_uses() [2/2]

void set_samples_uses ( const Tensor< string, 1 > & new_uses )

Sets new uses to all the samples from a single vector of strings.

Parameters

new_uses vector of use strings. Possible values for the elements are "Training", "Selection", "Testing" and "Unused". The size of given vector must be equal to the number of samples.

Definition at line 1670 of file data_set.cpp.

◆ set_selection() [1/2]

void set_selection ( )

Sets all the samples in the data set for selection.

Definition at line 1488 of file data_set.cpp.

◆ set_selection() [2/2]

void set_selection ( const Tensor< Index, 1 > & indices )

Sets samples with given indices in the data set for selection.

Parameters

indices Indices vector with the index of samples in the data set for selection.

Definition at line 1531 of file data_set.cpp.

◆ set_separator() [1/3]

void set_separator ( const char & new_separator )

Sets a new separator from a char.

Parameters

new_separator Char with the separator value.

Definition at line 4892 of file data_set.cpp.

◆ set_separator() [2/3]

void set_separator ( const Separator & new_separator )

Sets a new separator.

Parameters

new_separator Separator value.

Definition at line 4883 of file data_set.cpp.

◆ set_separator() [3/3]

void set_separator ( const string & new_separator_string )

Sets a new separator from a string.

Parameters

new_separator_string String with the separator value.

Definition at line 4926 of file data_set.cpp.

◆ set_steps_ahead_number()

void set_steps_ahead_number ( const Index & new_steps_ahead_number )

Sets a new number of steps ahead to be defined for a time series prediction application. When loading the data file, the time series data will be modified according to this number.

Parameters

new_steps_ahead_number Number of steps ahead to be used.

Definition at line 5032 of file data_set.cpp.

◆ set_target()

void set_target ( )

Sets all the variables in the data set as target variables.

Definition at line 3441 of file data_set.cpp.

◆ set_testing() [1/2]

void set_testing ( )

Sets all the samples in the data set for testing.

Definition at line 1501 of file data_set.cpp.

◆ set_testing() [2/2]

void set_testing ( const Tensor< Index, 1 > & indices )

Sets samples with given indices in the data set for testing.

Parameters

indices Indices vector with the index of samples in the data set for testing.

Definition at line 1547 of file data_set.cpp.

◆ set_threads_number()

void set_threads_number ( const int & new_threads_number )

Definition at line 5047 of file data_set.cpp.

◆ set_time_column()

void set_time_column ( const string & new_time_column )

Sets the new position where the time data is located in the data set.

Parameters

new_time_column Position where the time data is located.

Definition at line 5041 of file data_set.cpp.

◆ set_time_series_columns_number()

void set_time_series_columns_number ( const Index & new_variables_number )

Definition at line 4847 of file data_set.cpp.

◆ set_time_series_data()

void set_time_series_data ( const Tensor< type, 2 > & new_data )

Definition at line 4841 of file data_set.cpp.

◆ set_training() [1/2]

void set_training ( )

Sets all the samples in the data set for training.

Definition at line 1475 of file data_set.cpp.

◆ set_training() [2/2]

void set_training ( const Tensor< Index, 1 > & indices )

Sets samples with given indices in the data set for training.

Parameters

indices Indices vector with the index of samples in the data set for training.

Definition at line 1515 of file data_set.cpp.

◆ set_variable_name()

void set_variable_name	(	const Index &	variable_index,
		const string &	new_variable_name
	)

This method sets the name of a single variable.

Parameters

variable_index	Index of variable.
new_variable_name	Name of variable.

Definition at line 3295 of file data_set.cpp.

◆ set_variables_names()

void set_variables_names ( const Tensor< string, 1 > & new_variables_names )

Sets new names for the variables in the data set from a vector of strings. The size of that vector must be equal to the total number of variables.

Parameters

new_variables_names Names of the variables.

Definition at line 3355 of file data_set.cpp.

◆ set_variables_unused()

void set_variables_unused ( )

Sets all the variables in the data set as unused variables.

Definition at line 3452 of file data_set.cpp.

◆ shuffle()

void shuffle ( )

Definition at line 11113 of file data_set.cpp.

◆ split_isolation_tree()

Index split_isolation_tree	(	Tensor< type, 2 > &	tree,
		list< list< Index > > &	tree_simulation,
		list< Index > &	tree_index
	)		const

private

Definition at line 9082 of file data_set.cpp.

◆ split_samples()

Tensor< Index, 2 > split_samples	(	const Tensor< Index, 1 > &	samples_indices,
		const Index &	new_batch_size
	)		const

Definition at line 10981 of file data_set.cpp.

◆ split_samples_random()

void split_samples_random	(	const type &	training_samples_ratio = `static_cast<type>(0.6)`,
		const type &	selection_samples_ratio = `static_cast<type>(0.2)`,
		const type &	testing_samples_ratio = `static_cast<type>(0.2)`
	)

Creates new training, selection and testing indices at random.

Parameters

training_samples_ratio	Ratio of training samples in the data set.
selection_samples_ratio	Ratio of selection samples in the data set.
testing_samples_ratio	Ratio of testing samples in the data set.

Definition at line 1726 of file data_set.cpp.

◆ split_samples_sequential()

void split_samples_sequential	(	const type &	training_samples_ratio = `static_cast<type>(0.6)`,
		const type &	selection_samples_ratio = `static_cast<type>(0.2)`,
		const type &	testing_samples_ratio = `static_cast<type>(0.2)`
	)

Creates new training, selection and testing indices with sequential indices.

Parameters

training_samples_ratio	Ratio of training samples in the data set.
selection_samples_ratio	Ratio of selection samples in the data set.
testing_samples_ratio	Ratio of testing samples in the data set.

Definition at line 1834 of file data_set.cpp.

◆ transform_binary_column()

Tensor< type, 2 > transform_binary_column ( const Tensor< type, 1 > & column ) const

Definition at line 3620 of file data_set.cpp.

◆ transform_time_series()

void transform_time_series ( )

Arranges an input-target DataSet from a time series matrix, according to the number of lags.

Definition at line 8007 of file data_set.cpp.

◆ transform_time_series_columns()

void transform_time_series_columns ( )

This method transforms the columns into time series for forecasting problems.

Definition at line 790 of file data_set.cpp.

◆ transform_time_series_data()

void transform_time_series_data ( )

Definition at line 870 of file data_set.cpp.

◆ unscale_data()

void unscale_data ( const Tensor< Descriptives, 1 > & variables_descriptives )

Definition at line 6197 of file data_set.cpp.

◆ unscale_input_variables()

void unscale_input_variables ( const Tensor< Descriptives, 1 > & input_variables_descriptives )

It unscales every input variable with the given method. The method to be used is that in the scaling and unscaling method variable.

Definition at line 6351 of file data_set.cpp.

◆ unscale_target_variables()

void unscale_target_variables ( const Tensor< Descriptives, 1 > & targets_descriptives )

It unscales the target variables with the given descriptives. The method to be used is that in the scaling and unscaling method variable.

Definition at line 6397 of file data_set.cpp.

◆ unuse_constant_columns()

Tensor< string, 1 > unuse_constant_columns ( )

Removes the input or target indices of those variables with zero standard deviation. It might change the size of the vectors containing the inputs and targets indices.

Definition at line 5073 of file data_set.cpp.

◆ unuse_repeated_samples()

Tensor< Index, 1 > unuse_repeated_samples ( )

Removes the training, selection and testing indices of those samples which are repeated in the data matrix. It might change the size of the vectors containing the training, selection and testing indices.

Definition at line 5112 of file data_set.cpp.

◆ unuse_Tukey_outliers()

void unuse_Tukey_outliers ( const type & cleaning_parameter = type(1.5) )

Calculates the outliers in the data set using Tukey's test and sets them as unused in the samples object.

Parameters

cleaning_parameter Parameter used to detect outliers

Todo:

Definition at line 8624 of file data_set.cpp.

◆ unuse_uncorrelated_columns()

Tensor< string, 1 > unuse_uncorrelated_columns ( const type & minimum_correlation = type(0.25) )

Returns the unused variables without correlation.

Parameters

minimum_correlation Minimum correlation between variables.

Definition at line 5163 of file data_set.cpp.

◆ write_XML()

void write_XML ( tinyxml2::XMLPrinter & file_stream ) const

Serializes the data set object into an XML document of the TinyXML library without keeping the DOM tree in memory.

Definition at line 6486 of file data_set.cpp.

Member Data Documentation

◆ columns

Tensor<Column, 1> columns

private

Definition at line 764 of file data_set.h.

◆ columns_missing_values_number

Tensor<Index, 1> columns_missing_values_number

private

Definition at line 826 of file data_set.h.

◆ data

Tensor<type, 2> data

private

Data Matrix. The number of rows is the number of samples. The number of columns is the number of variables.

Definition at line 754 of file data_set.h.

◆ data_file_name

string data_file_name

private

Data file name.

Definition at line 772 of file data_set.h.

◆ data_file_preview

Tensor<Tensor<string, 1>, 1> data_file_preview

private

Definition at line 790 of file data_set.h.

◆ display

bool display = true

private

Display messages to screen.

Definition at line 832 of file data_set.h.

◆ gmt

Index gmt = 0

private

Definition at line 814 of file data_set.h.

◆ has_columns_names

bool has_columns_names = false

private

Header which contains variables name.

Definition at line 784 of file data_set.h.

◆ has_rows_labels

bool has_rows_labels = false

private

Header which contains the rows label.

Definition at line 788 of file data_set.h.

◆ input_variables_dimensions

Tensor<Index, 1> input_variables_dimensions

private

Definition at line 766 of file data_set.h.

◆ lags_number

Index lags_number = 0

private

Number of lags.

Definition at line 800 of file data_set.h.

◆ missing_values_label

string missing_values_label = "NA"

private

Missing values label.

Definition at line 780 of file data_set.h.

◆ missing_values_method

MissingValuesMethod missing_values_method = MissingValuesMethod::Unuse

private

Missing values method.

Definition at line 820 of file data_set.h.

◆ missing_values_number

Index missing_values_number

private

Missing values.

Definition at line 824 of file data_set.h.

◆ non_blocking_thread_pool

NonBlockingThreadPool* non_blocking_thread_pool = nullptr

private

Definition at line 745 of file data_set.h.

◆ product_vector_vector

const Eigen::array<IndexPair<Index>, 1> product_vector_vector = {IndexPair<Index>(0, 0)}

private

Definition at line 834 of file data_set.h.

◆ rows_labels

Tensor<string, 1> rows_labels

private

Definition at line 760 of file data_set.h.

◆ rows_missing_values_number

Index rows_missing_values_number

private

Definition at line 828 of file data_set.h.

◆ samples_uses

Tensor<SampleUse, 1> samples_uses

private

Definition at line 758 of file data_set.h.

◆ separator

Separator separator = Separator::Comma

private

Separator character.

Definition at line 776 of file data_set.h.

◆ steps_ahead

Index steps_ahead = 0

private

Number of steps ahead.

Definition at line 804 of file data_set.h.

◆ thread_pool_device

ThreadPoolDevice* thread_pool_device = nullptr

private

Definition at line 746 of file data_set.h.

◆ time_column

string time_column

private

Index where time variable is located for forecasting applications.

Definition at line 796 of file data_set.h.

◆ time_series_columns

Tensor<Column, 1> time_series_columns

private

Definition at line 812 of file data_set.h.

◆ time_series_data

Tensor<type, 2> time_series_data

private

Time series data matrix. The number of rows is the number of samples before time series transformation. The number of columns is the number of variables before time series transformation.

Definition at line 810 of file data_set.h.

The documentation for this class was generated from the following files:

Classes

Public Types

Public Member Functions

Static Public Member Functions

Private Member Functions

Private Attributes

Detailed Description

Member Enumeration Documentation

◆ ColumnType

◆ MissingValuesMethod

◆ ProjectType

◆ SampleUse

◆ Separator

◆ VariableUse

Constructor & Destructor Documentation

◆ DataSet() [1/5]

◆ DataSet() [2/5]

◆ DataSet() [3/5]

◆ DataSet() [4/5]

◆ DataSet() [5/5]

◆ ~DataSet()

Member Function Documentation

◆ calculate_autocorrelations()

◆ calculate_average_forest_paths()

◆ calculate_average_reachability()

◆ calculate_bounding_boxes_neighbors()

◆ calculate_columns_box_plots()

◆ calculate_columns_descriptives_categories()

◆ calculate_columns_descriptives_negative_samples()

◆ calculate_columns_descriptives_positive_samples()

◆ calculate_columns_descriptives_selection_samples()

◆ calculate_columns_descriptives_training_samples()

◆ calculate_columns_distribution()

◆ calculate_cross_correlations()

◆ calculate_distance_matrix()

◆ calculate_euclidean_distance()

◆ calculate_input_columns_correlations()

◆ calculate_input_target_columns_correlations()

◆ calculate_input_variables_descriptives()

◆ calculate_input_variables_maximums()

◆ calculate_input_variables_minimums()

◆ calculate_isolation_forest_outliers()

◆ calculate_k_nearest_neighbors()

◆ calculate_kd_tree_neighbors()

◆ calculate_local_outlier_factor()

◆ calculate_local_outlier_factor_outliers()

◆ calculate_min_max_indices_list()

◆ calculate_selection_negatives()

◆ calculate_selection_targets_mean()

◆ calculate_target_distribution()

◆ calculate_target_variables_descriptives()

◆ calculate_target_variables_maximums()

◆ calculate_target_variables_minimums()

◆ calculate_testing_negatives()

◆ calculate_testing_target_variables_descriptives()

◆ calculate_training_negatives()

◆ calculate_tree_path()

◆ calculate_Tukey_outliers()

◆ calculate_used_negatives()

◆ calculate_used_targets_mean()

◆ calculate_used_variables_descriptives()

◆ calculate_used_variables_minimums()

◆ calculate_variables_descriptives()

◆ calculate_variables_means()

◆ check_constant_columns()

◆ check_input_csv()

◆ check_separators()

◆ check_special_characters()

◆ count_nan()

◆ count_nan_columns()

◆ count_rows_with_nan()

◆ create_bounding_limits_kd_tree()

◆ create_isolation_forest()

◆ create_isolation_tree()

◆ create_kd_tree()

◆ filter_data()

◆ fix_repeated_names()

◆ from_XML()

◆ generate_constant_data()

◆ generate_random_data()