28 LanguageDataset(
const filesystem::path&, Index maximum_vocabulary_size, Index minimum_token_frequency);
76 bool parallelize =
true,
77 int = -1)
const override;
84 bool parallelize =
true,
85 int = -1)
const override;
92 bool parallelize =
true,
93 int = -1)
const override;
108 void update_input_vocabulary_map();
109 void update_target_vocabulary_maps();
111 unordered_map<string_view, Index> create_vocabulary_map(
const vector<string>& vocabulary)
const;
113 void load_documents(
string& buffer,
114 vector<vector<string_view>>& input_documents,
115 vector<vector<string_view>>& target_documents,
116 bool has_header_line,
117 bool strict_field_count)
const;
119 void encode_streaming(
const vector<vector<string_view>>&,
120 const vector<vector<string_view>>&,
121 vector<vector<Index>>& in_idx,
122 vector<vector<Index>>& tgt_idx)
const;
124 void write_binary_cache(
const vector<vector<Index>>& in_idx,
125 const vector<vector<Index>>& tgt_idx,
128 bool try_load_binary_cache(Index expected_samples);
130 vector<string> input_vocabulary;
131 vector<string> target_vocabulary;
133 unordered_map<string, Index> input_vocabulary_map;
134 unordered_map<string, Index> target_vocabulary_map;
135 unordered_map<Index, string> target_inverse_vocabulary_map;
137 Index maximum_input_sequence_length = 0;
138 Index maximum_target_sequence_length = 0;
140 Index minimum_token_frequency = 1;
141 Index maximum_vocabulary_size = 20000;
145 filesystem::path cache_path;
147 uint64_t cache_data_off_ = 0;
148 vector<array<int64_t, 4>> offsets_table;
virtual Index get_samples_number() const
Returns the total number of samples (rows) in the data matrix.
Definition dataset.h:74
Thread-safe positional file reader (pread on POSIX, overlapped ReadFile on Windows).
Definition io_utilities.h:20
VectorI calculate_target_distribution() const override
Returns the distribution of target tokens or classes.
static const float UNK_INDEX
Definition language_dataset.h:100
const unordered_map< Index, string > & get_target_inverse_vocabulary_map() const
Definition language_dataset.h:40
void fill_inputs(const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
Streams input token indices of the selected samples into the destination buffer.
const vector< string > & get_input_vocabulary() const
Definition language_dataset.h:32
const vector< string > & get_target_vocabulary() const
Definition language_dataset.h:33
LanguageDataset(const filesystem::path &="")
Creates a language dataset, optionally reading documents from the given path.
void set_input_vocabulary(const vector< string > &)
Replaces the input vocabulary and rebuilds its lookup map.
void fill_decoder(const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
Streams decoder token indices (target shifted right) into the destination buffer.
static const float START_INDEX
Definition language_dataset.h:101
void to_JSON(JsonWriter &) const override
Writes dataset state to a JSON writer.
void from_JSON(const JsonDocument &) override
Loads dataset state from a JSON document.
static const string UNK_TOKEN
Definition language_dataset.h:96
void set_target_vocabulary(const vector< string > &)
Replaces the target vocabulary and rebuilds its lookup maps.
const unordered_map< string, Index > & get_target_vocabulary_map() const
Definition language_dataset.h:39
void read_txt()
Reads the configured text corpus into the dataset and builds vocabularies.
void fill_targets(const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
Streams target token indices of the selected samples into the destination buffer.
static const vector< string > reserved_tokens
Definition language_dataset.h:104
static const float END_INDEX
Definition language_dataset.h:102
LanguageDataset(const Index, Index, Index)
Creates a synthetic language dataset with the given sample count and vocabulary controls.
static const string PAD_TOKEN
Definition language_dataset.h:95
void set_maximum_vocabulary_size(Index new_maximum)
Definition language_dataset.h:50
void create_vocabulary(const vector< vector< string_view > > &, vector< string > &) const
Builds a vocabulary from the tokenized documents, filtered by frequency and size limits.
Index get_target_vocabulary_size() const
Definition language_dataset.h:36
LanguageDataset(const filesystem::path &, Index maximum_vocabulary_size, Index minimum_token_frequency)
Creates a language dataset whose vocabulary is limited by a maximum size and a minimum token frequency.
void set_minimum_token_frequency(Index new_minimum)
Definition language_dataset.h:51
static const string START_TOKEN
Definition language_dataset.h:97
Index get_maximum_input_sequence_length() const
Definition language_dataset.h:42
LanguageDataset(const filesystem::path &, Index maximum_vocabulary_size)
Creates a language dataset whose vocabulary is capped at the given maximum size.
Index get_input_vocabulary_size() const
Definition language_dataset.h:35
Index get_samples_number() const override
Returns the number of token sequences in the dataset.
Index get_maximum_target_sequence_length() const
Definition language_dataset.h:43
static const string END_TOKEN
Definition language_dataset.h:98
const unordered_map< string, Index > & get_input_vocabulary_map() const
Definition language_dataset.h:38
Definition adaptive_moment_estimation.h:14
Matrix< Index, Dynamic, 1 > VectorI
Definition pch.h:182