#include #include // File I/O functions #include "io.cpp" // Tokenizer functions #include "token.cpp" using namespace std; const char *VERSION = "1.0.0"; char showTokens = 0; char FILE_PATH[1024]; static inline void process_args(int argc, char **args) { for (int i = 0; i < argc; i++) { if (std::string(args[i]) == "--help" || std::string(args[i]) == "-h") { std::cout << "Usage: " << args[0] << " [options]\n"; std::cout << "Options:\n"; std::cout << " --help -h\tShow this help message\n"; std::cout << " --version -v\tShow version information\n"; std::cout << " --input -i\tSpecify input file path\n"; exit(0); } if (std::string(args[i]) == "--version" || std::string(args[i]) == "-v") { std::cout << "Version " << VERSION << "\n"; exit(0); } if (std::string(args[i]) == "--input" || std::string(args[i]) == "-i") { if (i + 1 < argc) { i++; strncpy(FILE_PATH, args[i], sizeof(FILE_PATH) - 1); FILE_PATH[sizeof(FILE_PATH) - 1] = '\0'; } else { std::cerr << "Error: --input option requires a file path\n"; exit(1); } } if (std::string(args[i]) == "--show-tokens") { showTokens = 1; } } if (FILE_PATH[0] == '\0') { std::cerr << "Error: No input file specified. Use --input to specify an input file.\n"; exit(1); } } int main(int argc, char **args) { process_args(argc, args); vector> _file_content; try { _file_content = read_char(FILE_PATH); } catch (const std::exception &e) { std::cerr << "Error: [" << e.what() << "]\n"; return 1; } cout << "File read successfully. Total lines: " << _file_content.size() << "\n"; cout << "First line (as chars): "; if (!_file_content.empty()) { for (const auto &ch : _file_content[0]) { printf("%c", ch); } cout << "\n"; } // NOTE: Tokenizer here vector tokens = lexer(_file_content); Vocabulary vocab = tokenizer(tokens); // NOTE: Pretty print the vocab mapping if (showTokens) { cout << "Vocabulary Mapping:\n"; for (const auto &pair : vocab.value_map) { cout << "Token: ID: ["; printf("%03llu", pair.second); cout << "] -> " << type_to_string(vocab.type_map.at(pair.first)) << "\tToken: [" << pair.first << "]" << "\n"; } } // NOTE: Encode tokens to IDs vector encoded_tokens = encoder(tokens, vocab); return 0; }