// Generated from AfonsoCMSousa/CPP-Template. Original listing: C++, 92 lines, 2.3 KiB.
#include <iostream>
|
|
#include <vector>
|
|
|
|
// File I/O functions
|
|
#include "io.cpp"
|
|
|
|
// Tokenizer functions
|
|
#include "token.cpp"
|
|
|
|
using namespace std;
|
|
|
|
/// Program version string; printed by the --version / -v option.
const char *VERSION = "1.0.0";

/// Non-zero once --show-tokens has been seen; main() then prints the
/// vocabulary mapping.
char showTokens = 0;

/// Input file path filled in by --input / -i. It stays empty (first byte
/// '\0') until that option is parsed, which process_args() treats as
/// "no input file specified".
char FILE_PATH[1024] = {};
|
|
|
|
static inline void process_args(int argc, char **args) {
|
|
for (int i = 0; i < argc; i++) {
|
|
if (std::string(args[i]) == "--help" || std::string(args[i]) == "-h") {
|
|
std::cout << "Usage: " << args[0] << " [options]\n";
|
|
std::cout << "Options:\n";
|
|
std::cout << " --help -h\tShow this help message\n";
|
|
std::cout << " --version -v\tShow version information\n";
|
|
std::cout << " --input -i\tSpecify input file path\n";
|
|
exit(0);
|
|
}
|
|
|
|
if (std::string(args[i]) == "--version" || std::string(args[i]) == "-v") {
|
|
std::cout << "Version " << VERSION << "\n";
|
|
exit(0);
|
|
}
|
|
|
|
if (std::string(args[i]) == "--input" || std::string(args[i]) == "-i") {
|
|
if (i + 1 < argc) {
|
|
i++;
|
|
strncpy(FILE_PATH, args[i], sizeof(FILE_PATH) - 1);
|
|
FILE_PATH[sizeof(FILE_PATH) - 1] = '\0';
|
|
} else {
|
|
std::cerr << "Error: --input option requires a file path\n";
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
if (std::string(args[i]) == "--show-tokens") {
|
|
showTokens = 1;
|
|
}
|
|
}
|
|
|
|
if (FILE_PATH[0] == '\0') {
|
|
std::cerr << "Error: No input file specified. Use --input <file_path> to specify an input file.\n";
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
int main(int argc, char **args) {
|
|
process_args(argc, args);
|
|
vector<vector<char>> _file_content;
|
|
|
|
try {
|
|
_file_content = read_char(FILE_PATH);
|
|
} catch (const std::exception &e) {
|
|
std::cerr << "Error: [" << e.what() << "]\n";
|
|
return 1;
|
|
}
|
|
cout << "File read successfully. Total lines: " << _file_content.size() << "\n";
|
|
cout << "First line (as chars): ";
|
|
if (!_file_content.empty()) {
|
|
for (const auto &ch : _file_content[0]) {
|
|
printf("%c", ch);
|
|
}
|
|
cout << "\n";
|
|
}
|
|
|
|
// NOTE: Tokenizer here
|
|
vector<string> tokens = lexer(_file_content);
|
|
Vocabulary vocab = tokenizer(tokens);
|
|
|
|
// NOTE: Pretty print the vocab mapping
|
|
if (showTokens) {
|
|
cout << "Vocabulary Mapping:\n";
|
|
for (const auto &pair : vocab.value_map) {
|
|
cout << "Token: ID: [";
|
|
printf("%03llu", pair.second);
|
|
cout << "] -> " << type_to_string(vocab.type_map.at(pair.first)) << "\tToken: [" << pair.first << "]" << "\n";
|
|
}
|
|
}
|
|
|
|
// NOTE: Encode tokens to IDs
|
|
vector<VALUE_ID> encoded_tokens = encoder(tokens, vocab);
|
|
return 0;
|
|
}
|