documentation/reference/language__dataset_8h_source.html

//   OpenNN: Open Neural Networks Library

//   www.opennn.net

//

//   L A N G U A G E   D A T A S E T   C L A S S   H E A D E R

//

//   Artificial Intelligence Techniques SL

//   artelnics@artelnics.com


#pragma once


#include "dataset.h"


namespace opennn

{


class LanguageDataset final : public Dataset

{


public:


    LanguageDataset(const filesystem::path& path = "");

    LanguageDataset(const Index samples_number,

                    Index maximum_input_sequence_length,

                    Index maximum_target_sequence_length);


    const vector<string>& get_input_vocabulary() const { return input_vocabulary; }

    const vector<string>& get_target_vocabulary() const { return target_vocabulary; }


    Index get_input_vocabulary_size() const { return input_vocabulary.size(); }

    Index get_target_vocabulary_size() const { return target_vocabulary.size(); }


    Index get_maximum_input_sequence_length() const { return maximum_input_sequence_length; }

    Index get_maximum_target_sequence_length() const { return maximum_target_sequence_length; }


    void set_input_vocabulary(const vector<string>& new_vocabulary) { input_vocabulary = new_vocabulary; }

    void set_target_vocabulary(const vector<string>& new_vocabulary) { target_vocabulary = new_vocabulary; }


    void read_csv() override;


    void create_vocabulary(const vector<vector<string>>&, vector<string>&) const;


    void encode_input(const vector<vector<string>>&);

    void encode_decoder_target_sequence_to_sequence(const vector<vector<string>>&);

    void encode_target_classification(const vector<vector<string>>&);


    void from_JSON(const JsonDocument&) override;

    void to_JSON(JsonWriter&) const override;


    inline static const string PAD_TOKEN   = "[PAD]";

    inline static const string UNK_TOKEN   = "[UNK]";

    inline static const string START_TOKEN = "[START]";

    inline static const string END_TOKEN   = "[END]";


    inline static const float UNK_INDEX = 1.0f;

    inline static const float START_INDEX = 2.0f;

    inline static const float END_INDEX = 3.0f;


    inline static const vector<string> reserved_tokens = {PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN};


private:


    unordered_map<string, Index> create_vocabulary_map(const vector<string>& vocabulary);


    vector<string> input_vocabulary;

    vector<string> target_vocabulary;


    Index maximum_input_sequence_length = 0;

    Index maximum_target_sequence_length = 0;


    Index minimum_token_frequency = 1;

    Index maximum_vocabulary_size = 20000;

};


}


// OpenNN: Open Neural Networks Library.

// Copyright(C) 2005-2026 Artificial Intelligence Techniques, SL.

// Licensed under the GNU Lesser General Public License v2.1 or later.

opennn::Dataset::Dataset
Dataset(const Index samples_number=0, const Shape &input_shape={0}, const Shape &target_shape={0})
Constructs an empty dataset of given dimensions.

opennn::JsonDocument
Definition json.h:71

opennn::JsonWriter
Definition json.h:84

opennn::LanguageDataset::UNK_INDEX
static const float UNK_INDEX
Numeric id used for unknown tokens.
Definition language_dataset.h:132

opennn::LanguageDataset::LanguageDataset
LanguageDataset(const filesystem::path &path="")
Constructs a LanguageDataset, optionally loading from CSV.

opennn::LanguageDataset::get_input_vocabulary
const vector< string > & get_input_vocabulary() const
Read-only access to the input-side vocabulary.
Definition language_dataset.h:55

opennn::LanguageDataset::get_target_vocabulary
const vector< string > & get_target_vocabulary() const
Read-only access to the target-side vocabulary.
Definition language_dataset.h:57

opennn::LanguageDataset::LanguageDataset
LanguageDataset(const Index samples_number, Index maximum_input_sequence_length, Index maximum_target_sequence_length)
Constructs an empty LanguageDataset of given dimensions.

opennn::LanguageDataset::set_input_vocabulary
void set_input_vocabulary(const vector< string > &new_vocabulary)
Replaces the input vocabulary.
Definition language_dataset.h:73

opennn::LanguageDataset::encode_decoder_target_sequence_to_sequence
void encode_decoder_target_sequence_to_sequence(const vector< vector< string > > &)
Encodes decoder target tokens for sequence-to-sequence training.

opennn::LanguageDataset::START_INDEX
static const float START_INDEX
Numeric id used for the start-of-sequence token.
Definition language_dataset.h:134

opennn::LanguageDataset::to_JSON
void to_JSON(JsonWriter &) const override
Writes dataset metadata and vocabularies to a streaming JSON writer.

opennn::LanguageDataset::encode_target_classification
void encode_target_classification(const vector< vector< string > > &)
Encodes target tokens for classification training.

opennn::LanguageDataset::from_JSON
void from_JSON(const JsonDocument &) override
Loads dataset metadata and vocabularies from a parsed JSON document.

opennn::LanguageDataset::UNK_TOKEN
static const string UNK_TOKEN
Unknown token text (id 1).
Definition language_dataset.h:125

opennn::LanguageDataset::reserved_tokens
static const vector< string > reserved_tokens
Special tokens reserved at the start of every vocabulary.
Definition language_dataset.h:139

opennn::LanguageDataset::END_INDEX
static const float END_INDEX
Numeric id used for the end-of-sequence token.
Definition language_dataset.h:136

opennn::LanguageDataset::set_target_vocabulary
void set_target_vocabulary(const vector< string > &new_vocabulary)
Replaces the target vocabulary.
Definition language_dataset.h:78

opennn::LanguageDataset::create_vocabulary
void create_vocabulary(const vector< vector< string > > &, vector< string > &) const
Builds a vocabulary from a list of tokenized samples.

opennn::LanguageDataset::encode_input
void encode_input(const vector< vector< string > > &)
Encodes input tokens into ids and stores them in the dataset.

opennn::LanguageDataset::PAD_TOKEN
static const string PAD_TOKEN
Padding token text (id 0).
Definition language_dataset.h:123

opennn::LanguageDataset::get_target_vocabulary_size
Index get_target_vocabulary_size() const
Number of distinct tokens in the target vocabulary.
Definition language_dataset.h:62

opennn::LanguageDataset::START_TOKEN
static const string START_TOKEN
Start-of-sequence token text (id 2).
Definition language_dataset.h:127

opennn::LanguageDataset::read_csv
void read_csv() override
Reads tokens from the configured CSV file into the dataset.

opennn::LanguageDataset::get_maximum_input_sequence_length
Index get_maximum_input_sequence_length() const
Maximum input sequence length supported.
Definition language_dataset.h:65

opennn::LanguageDataset::get_input_vocabulary_size
Index get_input_vocabulary_size() const
Number of distinct tokens in the input vocabulary.
Definition language_dataset.h:60

opennn::LanguageDataset::get_maximum_target_sequence_length
Index get_maximum_target_sequence_length() const
Maximum target sequence length supported.
Definition language_dataset.h:67

opennn::LanguageDataset::END_TOKEN
static const string END_TOKEN
End-of-sequence token text (id 3).
Definition language_dataset.h:129

dataset.h
Declares the Dataset class and the SampleRole enum.

opennn
Definition adaptive_moment_estimation.h:19