92 lines
2.3 KiB
C++

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
// File I/O functions
#include "io.cpp"
// Tokenizer functions
#include "token.cpp"
using namespace std;
// Version string printed by the --version / -v option.
const char *VERSION = "1.0.0";
// Set to 1 when --show-tokens is passed; main() then prints the vocabulary mapping.
char showTokens = 0;
// Input file path filled in by --input / -i. As a global it starts zeroed, so
// an empty string means "no input file was given".
char FILE_PATH[1024];

// Parses the command line and records the result in showTokens and FILE_PATH.
//
// Recognised options:
//   --help, -h          print usage and exit with status 0
//   --version, -v       print VERSION and exit with status 0
//   --input, -i <path>  copy <path> into FILE_PATH
//   --show-tokens       set showTokens to 1
// Arguments that match none of the above are ignored.
//
// Exits with status 1 when --input has no value, when the path does not fit
// in FILE_PATH, or when no input path was supplied at all.
//
// argc / args are the values received by main().
static inline void process_args(int argc, char **args) {
  // args[0] is the program name, not an option, so scanning starts at 1.
  for (int i = 1; i < argc; i++) {
    const std::string arg(args[i]);

    if (arg == "--help" || arg == "-h") {
      std::cout << "Usage: " << args[0] << " [options]\n";
      std::cout << "Options:\n";
      std::cout << " --help -h\tShow this help message\n";
      std::cout << " --version -v\tShow version information\n";
      std::cout << " --input -i\tSpecify input file path\n";
      std::cout << " --show-tokens\tPrint the vocabulary mapping\n";
      std::exit(0);
    } else if (arg == "--version" || arg == "-v") {
      std::cout << "Version " << VERSION << "\n";
      std::exit(0);
    } else if (arg == "--input" || arg == "-i") {
      if (i + 1 >= argc) {
        std::cerr << "Error: --input option requires a file path\n";
        std::exit(1);
      }
      // Consume the value here; the else-if chain guarantees it is never
      // re-interpreted as an option on this iteration.
      const char *path = args[++i];
      const std::size_t length = std::strlen(path);
      // A truncated path would name a different file, so reject it instead.
      if (length >= sizeof(FILE_PATH)) {
        std::cerr << "Error: input file path is too long (maximum "
                  << sizeof(FILE_PATH) - 1 << " characters)\n";
        std::exit(1);
      }
      std::memcpy(FILE_PATH, path, length + 1);  // +1 copies the terminator
    } else if (arg == "--show-tokens") {
      showTokens = 1;
    }
  }

  if (FILE_PATH[0] == '\0') {
    std::cerr << "Error: No input file specified. Use --input <file_path> to specify an input file.\n";
    std::exit(1);
  }
}
int main(int argc, char **args) {
process_args(argc, args);
vector<vector<char>> _file_content;
try {
_file_content = read_char(FILE_PATH);
} catch (const std::exception &e) {
std::cerr << "Error: [" << e.what() << "]\n";
return 1;
}
cout << "File read successfully. Total lines: " << _file_content.size() << "\n";
cout << "First line (as chars): ";
if (!_file_content.empty()) {
for (const auto &ch : _file_content[0]) {
printf("%c", ch);
}
cout << "\n";
}
// NOTE: Tokenizer here
vector<string> tokens = lexer(_file_content);
Vocabulary vocab = tokenizer(tokens);
// NOTE: Pretty print the vocab mapping
if (showTokens) {
cout << "Vocabulary Mapping:\n";
for (const auto &pair : vocab.value_map) {
cout << "Token: ID: [";
printf("%03llu", pair.second);
cout << "] -> " << type_to_string(vocab.type_map.at(pair.first)) << "\tToken: [" << pair.first << "]" << "\n";
}
}
// NOTE: Encode tokens to IDs
vector<VALUE_ID> encoded_tokens = encoder(tokens, vocab);
return 0;
}