Index maximum_input_sequence_length,
Index maximum_target_sequence_length);
unordered_map<string, Index> create_vocabulary_map(
const vector<string>& vocabulary);
vector<string> input_vocabulary;
vector<string> target_vocabulary;
Index maximum_input_sequence_length = 0;
Index maximum_target_sequence_length = 0;
Index minimum_token_frequency = 1;
Index maximum_vocabulary_size = 20000;
Dataset(const Index samples_number=0, const Shape &input_shape={0}, const Shape &target_shape={0})
Constructs an empty dataset of given dimensions.
static const float UNK_INDEX
Numeric id used for unknown tokens.
Definition language_dataset.h:132
LanguageDataset(const filesystem::path &path="")
Constructs a LanguageDataset, optionally loading from CSV.
const vector< string > & get_input_vocabulary() const
Read-only access to the input-side vocabulary.
Definition language_dataset.h:55
const vector< string > & get_target_vocabulary() const
Read-only access to the target-side vocabulary.
Definition language_dataset.h:57
LanguageDataset(const Index samples_number, Index maximum_input_sequence_length, Index maximum_target_sequence_length)
Constructs an empty LanguageDataset of given dimensions.
void set_input_vocabulary(const vector< string > &new_vocabulary)
Replaces the input vocabulary.
Definition language_dataset.h:73
void encode_decoder_target_sequence_to_sequence(const vector< vector< string > > &)
Encodes decoder target tokens for sequence-to-sequence training.
static const float START_INDEX
Numeric id used for the start-of-sequence token.
Definition language_dataset.h:134
void to_JSON(JsonWriter &) const override
Writes dataset metadata and vocabularies to a streaming JSON writer.
void encode_target_classification(const vector< vector< string > > &)
Encodes target tokens for classification training.
void from_JSON(const JsonDocument &) override
Loads dataset metadata and vocabularies from a parsed JSON document.
static const string UNK_TOKEN
Unknown token text (id 1).
Definition language_dataset.h:125
static const vector< string > reserved_tokens
Special tokens reserved at the start of every vocabulary.
Definition language_dataset.h:139
static const float END_INDEX
Numeric id used for the end-of-sequence token.
Definition language_dataset.h:136
void set_target_vocabulary(const vector< string > &new_vocabulary)
Replaces the target vocabulary.
Definition language_dataset.h:78
void create_vocabulary(const vector< vector< string > > &, vector< string > &) const
Builds a vocabulary from a list of tokenized samples.
void encode_input(const vector< vector< string > > &)
Encodes input tokens into ids and stores them in the dataset.
static const string PAD_TOKEN
Padding token text (id 0).
Definition language_dataset.h:123
Index get_target_vocabulary_size() const
Number of distinct tokens in the target vocabulary.
Definition language_dataset.h:62
static const string START_TOKEN
Start-of-sequence token text (id 2).
Definition language_dataset.h:127
void read_csv() override
Reads tokens from the configured CSV file into the dataset.
Index get_maximum_input_sequence_length() const
Maximum input sequence length supported.
Definition language_dataset.h:65
Index get_input_vocabulary_size() const
Number of distinct tokens in the input vocabulary.
Definition language_dataset.h:60
Index get_maximum_target_sequence_length() const
Maximum target sequence length supported.
Definition language_dataset.h:67
static const string END_TOKEN
End-of-sequence token text (id 3).
Definition language_dataset.h:129
Declares the Dataset class and the SampleRole enum.
Definition adaptive_moment_estimation.h:19