OpenNN
Open-source neural networks library
Loading...
Searching...
No Matches
language_dataset.h
Go to the documentation of this file.
1// OpenNN: Open Neural Networks Library
2// www.opennn.net
3//
4//   L A N G U A G E   D A T A S E T   C L A S S   H E A D E R
5//
6// Artificial Intelligence Techniques SL
7// artelnics@artelnics.com
8
9#pragma once
10
11#include "dataset.h"
12#include "io_utilities.h"
13
14namespace opennn
15{
16
18class LanguageDataset final : public Dataset
19{
20
21public:
22
24 LanguageDataset(const filesystem::path& = "");
26 LanguageDataset(const filesystem::path&, Index maximum_vocabulary_size);
28 LanguageDataset(const filesystem::path&, Index maximum_vocabulary_size, Index minimum_token_frequency);
30 LanguageDataset(const Index, Index, Index);
31
32 const vector<string>& get_input_vocabulary() const { return input_vocabulary; }
33 const vector<string>& get_target_vocabulary() const { return target_vocabulary; }
34
35 Index get_input_vocabulary_size() const { return input_vocabulary.size(); }
36 Index get_target_vocabulary_size() const { return target_vocabulary.size(); }
37
38 const unordered_map<string, Index>& get_input_vocabulary_map() const { return input_vocabulary_map; }
39 const unordered_map<string, Index>& get_target_vocabulary_map() const { return target_vocabulary_map; }
40 const unordered_map<Index, string>& get_target_inverse_vocabulary_map() const { return target_inverse_vocabulary_map; }
41
42 Index get_maximum_input_sequence_length() const { return maximum_input_sequence_length; }
43 Index get_maximum_target_sequence_length() const { return maximum_target_sequence_length; }
44
46 void set_input_vocabulary(const vector<string>&);
48 void set_target_vocabulary(const vector<string>&);
49
50 void set_maximum_vocabulary_size(Index new_maximum) { maximum_vocabulary_size = new_maximum; }
51 void set_minimum_token_frequency(Index new_minimum) { minimum_token_frequency = new_minimum; }
52
54 Index get_samples_number() const override;
56
59
61 void read_txt();
62
66 void create_vocabulary(const vector<vector<string_view>>&, vector<string>&) const;
67
68 void from_JSON(const JsonDocument&) override;
69 void to_JSON(JsonWriter&) const override;
70
72 void fill_inputs(const vector<Index>&,
73 const vector<Index>&,
74 float*,
75 bool is_training,
76 bool parallelize = true,
77 int = -1) const override;
78
80 void fill_targets(const vector<Index>&,
81 const vector<Index>&,
82 float*,
83 bool is_training,
84 bool parallelize = true,
85 int = -1) const override;
86
88 void fill_decoder(const vector<Index>&,
89 const vector<Index>&,
90 float*,
91 bool is_training,
92 bool parallelize = true,
93 int = -1) const override;
94
95 inline static const string PAD_TOKEN = "[PAD]"; // 0
96 inline static const string UNK_TOKEN = "[UNK]"; // 1
97 inline static const string START_TOKEN = "[START]"; // 2
98 inline static const string END_TOKEN = "[END]"; // 3
99
100 inline static const float UNK_INDEX = 1.0f;
101 inline static const float START_INDEX = 2.0f;
102 inline static const float END_INDEX = 3.0f;
103
104 inline static const vector<string> reserved_tokens = {PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN};
105
106private:
107
108 void update_input_vocabulary_map();
109 void update_target_vocabulary_maps();
110
111 unordered_map<string_view, Index> create_vocabulary_map(const vector<string>& vocabulary) const;
112
113 void load_documents(string& buffer,
114 vector<vector<string_view>>& input_documents,
115 vector<vector<string_view>>& target_documents,
116 bool has_header_line,
117 bool strict_field_count) const;
118
119 void encode_streaming(const vector<vector<string_view>>&,
120 const vector<vector<string_view>>&,
121 vector<vector<Index>>& in_idx,
122 vector<vector<Index>>& tgt_idx) const;
123
124 void write_binary_cache(const vector<vector<Index>>& in_idx,
125 const vector<vector<Index>>& tgt_idx,
126 bool has_decoder);
127
128 bool try_load_binary_cache(Index expected_samples);
129
130 vector<string> input_vocabulary;
131 vector<string> target_vocabulary;
132
133 unordered_map<string, Index> input_vocabulary_map;
134 unordered_map<string, Index> target_vocabulary_map;
135 unordered_map<Index, string> target_inverse_vocabulary_map;
136
137 Index maximum_input_sequence_length = 0;
138 Index maximum_target_sequence_length = 0;
139
140 Index minimum_token_frequency = 1;
141 Index maximum_vocabulary_size = 20000;
142
143 // Binary streaming cache: <data_path>.cache/tokens.bin
144 // Header(64B) + offsets table (int64[N][4]) + concat int32 tokens.
145 filesystem::path cache_path;
146 mutable FileReader cache_reader;
147 uint64_t cache_data_off_ = 0;
148 vector<array<int64_t, 4>> offsets_table; // (in_off, in_len, tgt_off, tgt_len)
149};
150
151}
152
153// OpenNN: Open Neural Networks Library.
154// Copyright(C) 2005-2026 Artificial Intelligence Techniques, SL.
155// Licensed under the GNU Lesser General Public License v2.1 or later.
Dataset()=default
virtual Index get_samples_number() const
Returns the total number of samples (rows) in the data matrix.
Definition dataset.h:74
Thread-safe positional file reader (pread on POSIX, overlapped ReadFile on Windows).
Definition io_utilities.h:20
Definition json.h:72
Definition json.h:85
VectorI calculate_target_distribution() const override
Returns the distribution of target tokens or classes.
static const float UNK_INDEX
Definition language_dataset.h:100
const unordered_map< Index, string > & get_target_inverse_vocabulary_map() const
Definition language_dataset.h:40
void fill_inputs(const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
Streams input token indices of the selected samples into the destination buffer.
const vector< string > & get_input_vocabulary() const
Definition language_dataset.h:32
const vector< string > & get_target_vocabulary() const
Definition language_dataset.h:33
LanguageDataset(const filesystem::path &="")
Creates a language dataset, optionally reading documents from the given path.
void set_input_vocabulary(const vector< string > &)
Replaces the input vocabulary and rebuilds its lookup map.
void fill_decoder(const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
Streams decoder token indices (target shifted right) into the destination buffer.
static const float START_INDEX
Definition language_dataset.h:101
void to_JSON(JsonWriter &) const override
Writes dataset state to a JSON writer.
void from_JSON(const JsonDocument &) override
Loads dataset state from a JSON document.
static const string UNK_TOKEN
Definition language_dataset.h:96
void set_target_vocabulary(const vector< string > &)
Replaces the target vocabulary and rebuilds its lookup maps.
const unordered_map< string, Index > & get_target_vocabulary_map() const
Definition language_dataset.h:39
void read_txt()
Reads the configured text corpus into the dataset and builds vocabularies.
void fill_targets(const vector< Index > &, const vector< Index > &, float *, bool is_training, bool parallelize=true, int=-1) const override
Streams target token indices of the selected samples into the destination buffer.
static const vector< string > reserved_tokens
Definition language_dataset.h:104
static const float END_INDEX
Definition language_dataset.h:102
LanguageDataset(const Index, Index, Index)
Creates a synthetic language dataset with the given sample count and vocabulary controls.
static const string PAD_TOKEN
Definition language_dataset.h:95
void set_maximum_vocabulary_size(Index new_maximum)
Definition language_dataset.h:50
void create_vocabulary(const vector< vector< string_view > > &, vector< string > &) const
Builds a vocabulary from the tokenized documents, filtered by frequency and size limits.
Index get_target_vocabulary_size() const
Definition language_dataset.h:36
LanguageDataset(const filesystem::path &, Index maximum_vocabulary_size, Index minimum_token_frequency)
Creates a language dataset with vocabulary size and minimum token frequency filters.
void set_minimum_token_frequency(Index new_minimum)
Definition language_dataset.h:51
static const string START_TOKEN
Definition language_dataset.h:97
Index get_maximum_input_sequence_length() const
Definition language_dataset.h:42
LanguageDataset(const filesystem::path &, Index maximum_vocabulary_size)
Creates a language dataset capped at the given vocabulary size.
Index get_input_vocabulary_size() const
Definition language_dataset.h:35
Index get_samples_number() const override
Returns the number of token sequences in the dataset.
Index get_maximum_target_sequence_length() const
Definition language_dataset.h:43
static const string END_TOKEN
Definition language_dataset.h:98
const unordered_map< string, Index > & get_input_vocabulary_map() const
Definition language_dataset.h:38
Definition adaptive_moment_estimation.h:14
Matrix< Index, Dynamic, 1 > VectorI
Definition pch.h:182