#ifndef TOKEN_CPP
#define TOKEN_CPP

// NOTE(review): the header names on the original #include lines were missing
// when this file was reviewed; the list below covers everything this file
// uses. Re-add any further headers the rest of the project relied on.
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

/// Numeric ID assigned to each distinct token text; 0 is reserved for
/// "unknown" (see encoder()).
typedef std::uint64_t VALUE_ID;

/// Untyped payload pointer. Only referenced by the commented-out Token fields.
typedef void *DATA;

/// Lexical category assigned to a token by tokenizer().
enum class TYPE {
  KEYWORD,
  IDENTIFIER,
  NUMBER,
  STRING,
  CHAR,
  OPERATOR,
  PUNCTUATION,
  PREPROCESSOR,
  HEADER,
  COMMENT,
  END_OF_FILE,
  UNKNOWN
};

/// A classified token: its category plus the vocabulary ID of its text.
struct Token {
  TYPE type;
  VALUE_ID value;
  // INFO: This is only used for debugging, the analyzer won't take this into account.
  // WARN: Removed: "This turns the tokenizer into a C-style unsafe container, not a dataset generator."
  // u_int64_t data_len;
  // DATA data;
};

/// Returns the display name of @p type.
///
/// Some names carry a trailing tab; those bytes are part of the returned
/// string and are kept exactly as they were. Any value without its own case
/// (including TYPE::UNKNOWN) yields "UNKNOWN\t".
std::string type_to_string(TYPE type) {
  switch (type) {
  case TYPE::KEYWORD:
    return "KEYWORD";
  case TYPE::IDENTIFIER:
    return "IDENTIFIER";
  case TYPE::NUMBER:
    return "NUMBER";
  case TYPE::STRING:
    return "STRING\t";
  case TYPE::CHAR:
    return "CHAR\t";
  case TYPE::OPERATOR:
    return "OPERATOR";
  case TYPE::PUNCTUATION:
    return "PUNCTUATION";
  case TYPE::PREPROCESSOR:
    return "PREPROCESSOR";
  case TYPE::HEADER:
    return "HEADER\t";
  case TYPE::COMMENT:
    return "COMMENT\t";
  case TYPE::END_OF_FILE:
    return "END_OF_FILE";
  default:
    return "UNKNOWN\t";
  }
}

/// Mapping from token text to its ID and to its lexical category.
struct Vocabulary {
  std::unordered_map<std::string, VALUE_ID> value_map; ///< token text -> ID
  std::unordered_map<std::string, TYPE> type_map;      ///< token text -> category
  VALUE_ID next_id = 1; ///< next ID to hand out; 0 stays reserved for unknown tokens
};

/// True when @p token ends in exactly one '-' (so a following '>' forms "->"
/// rather than following a "--").
static bool ends_with_lone_minus(const std::string &token) {
  const std::size_t n = token.size();
  if (n == 0 || token[n - 1] != '-')
    return false;
  return n < 2 || token[n - 2] != '-';
}

/// True when @p token, ignoring one pair of surrounding double quotes, ends in
/// a header-file suffix and has at least one character before that suffix.
static bool token_names_header(const std::string &token) {
  std::size_t begin = 0;
  std::size_t end = token.size();
  if (end >= 2 && token.front() == '"' && token.back() == '"') {
    begin = 1;
    --end;
  }
  const std::size_t name_len = end - begin;

  static const char *const suffixes[] = {".h", ".hh", ".hpp", ".hxx"};
  for (const char *suffix : suffixes) {
    const std::size_t suffix_len = std::char_traits<char>::length(suffix);
    if (name_len > suffix_len &&
        token.compare(end - suffix_len, suffix_len, suffix) == 0) {
      return true;
    }
  }
  return false;
}

/// Assigns a lexical category to a single token text.
///
/// The checks run in a fixed order and the first match wins: comment, header,
/// keyword, preprocessor word, string, char, number, operator, punctuation,
/// identifier, otherwise UNKNOWN. An empty token is UNKNOWN.
static TYPE classify_token(const std::string &token) {
  static const std::unordered_set<std::string> keywords = {
      "auto",     "break",  "case",    "char",     "const",    "continue",
      "default",  "do",     "double",  "else",     "enum",     "extern",
      "float",    "for",    "goto",    "if",       "inline",   "int",
      "long",     "register", "restrict", "return", "short",   "signed",
      "sizeof",   "static", "struct",  "switch",   "typedef",  "union",
      "unsigned", "void",   "volatile", "while"};
  // "if" and "else" also appear here, but the keyword check runs first, so
  // they are reported as KEYWORD.
  static const std::unordered_set<std::string> preprocessor_words = {
      "include", "define", "undef", "ifdef", "ifndef", "if",
      "elif",    "else",   "endif", "line",  "error",  "pragma"};
  static const std::unordered_set<std::string> operators = {
      "+",  "-",  "*",  "/",  "%",  "++", "--", "==",  "!=",  "<",  ">",  "<=",
      ">=", "&&", "||", "!",  "&",  "|",  "^",  "~",   "<<",  ">>", "=",  "+=",
      "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "->", "."};
  static const std::unordered_set<std::string> punctuation = {
      "(", ")", "{", "}", "[", "]", ",", ";", ":", "?"};

  if (token.empty())
    return TYPE::UNKNOWN;

  // <cctype> functions require a value representable as unsigned char.
  const unsigned char first = static_cast<unsigned char>(token[0]);

  if (token.size() >= 2 && token[0] == '/' &&
      (token[1] == '/' || token[1] == '*')) {
    return TYPE::COMMENT;
  }
  // Headers are checked before strings so that "token.h" (quoted include)
  // is still reported as HEADER.
  if (token_names_header(token))
    return TYPE::HEADER;
  // Keywords are checked before preprocessor words and identifiers.
  if (keywords.count(token) > 0)
    return TYPE::KEYWORD;
  if (token == "#" || preprocessor_words.count(token) > 0)
    return TYPE::PREPROCESSOR;
  if (token[0] == '"')
    return TYPE::STRING;
  if (token[0] == '\'')
    return TYPE::CHAR;
  if (std::isdigit(first) ||
      (token[0] == '.' && token.size() > 1 &&
       std::isdigit(static_cast<unsigned char>(token[1])))) {
    return TYPE::NUMBER;
  }
  if (operators.count(token) > 0)
    return TYPE::OPERATOR;
  if (punctuation.count(token) > 0)
    return TYPE::PUNCTUATION;
  if (std::isalpha(first) || token[0] == '_')
    return TYPE::IDENTIFIER;
  return TYPE::UNKNOWN;
}

/// Splits source text into raw token strings.
///
/// @param file_content  one element per input line, each a sequence of chars.
/// @return the tokens of all lines, in order of appearance.
///
/// Rules applied per line:
///  - A line whose first character after spaces/tabs is '#' is a preprocessor
///    line; on such lines '<' and '>' only separate tokens and are dropped.
///  - '\n' and '\0' always end the current token.
///  - Inside a quoted literal ("..." or '...') every character is kept,
///    including spaces; a backslash escapes the next character, and only the
///    delimiter that opened the literal closes it. Quote state is reset at the
///    start of every line and at '\n' / '\0', so an unbalanced quote cannot
///    affect later lines.
///  - Outside literals, space, tab and '\r' separate tokens.
///  - Emitted as their own tokens: # ( ) , ; = ! == != < > << >> <= >= <<= >>=
///    and ->.
///  - Every other character is appended to the current token. In particular,
///    arithmetic operators, braces and brackets are NOT split off.
std::vector<std::string> lexer(std::vector<std::vector<char>> file_content) {
  std::vector<std::string> tokens;

  for (const auto &line : file_content) {
    // Only the first non-blank character decides whether this is a
    // preprocessor line.
    bool is_preprocessor = false;
    for (const char ch : line) {
      if (ch == ' ' || ch == '\t')
        continue;
      is_preprocessor = (ch == '#');
      break;
    }

    std::string token;
    char quote = '\0';    // delimiter of the literal being read, '\0' if none
    bool escaped = false; // previous character inside the literal was '\'

    const auto flush = [&tokens, &token]() {
      if (!token.empty()) {
        tokens.push_back(token);
        token.clear();
      }
    };
    const auto peek = [&line](std::size_t pos) -> char {
      return pos < line.size() ? line[pos] : '\0';
    };

    for (std::size_t i = 0; i < line.size(); ++i) {
      const char ch = line[i];

      if (ch == '\n' || ch == '\0') {
        flush();
        quote = '\0';
        escaped = false;
        continue;
      }

      if (quote != '\0') {
        token += ch;
        if (escaped) {
          escaped = false;
        } else if (ch == '\\') {
          escaped = true;
        } else if (ch == quote) {
          quote = '\0';
        }
        continue;
      }

      if (ch == '"' || ch == '\'') {
        // The opening quote joins whatever is already in the token.
        quote = ch;
        token += ch;
        continue;
      }

      if (ch == ' ' || ch == '\t' || ch == '\r') {
        flush();
        continue;
      }

      if (ch == '<' || ch == '>') {
        // '-' is not a delimiter, so it is still sitting at the end of the
        // pending token when the '>' of "->" arrives.
        if (ch == '>' && ends_with_lone_minus(token)) {
          token.pop_back();
          flush();
          tokens.push_back("->");
          continue;
        }
        flush();
        if (is_preprocessor)
          continue; // delimiter only, e.g. #include <stdio.h>

        std::string op(1, ch);
        const char next = peek(i + 1);
        if (next == ch) {
          op += next; // << or >>
          if (peek(i + 2) == '=')
            op += '='; // <<= or >>=
        } else if (next == '=') {
          op += '='; // <= or >=
        }
        i += op.size() - 1; // consume the extra characters of the operator
        tokens.push_back(op);
        continue;
      }

      if (ch == '#' || ch == '(' || ch == ')' || ch == ',' || ch == ';' ||
          ch == '=' || ch == '!') {
        flush();
        if ((ch == '=' || ch == '!') && peek(i + 1) == '=') {
          tokens.push_back(std::string(1, ch) + '=');
          ++i; // consume the second '='
        } else {
          tokens.push_back(std::string(1, ch));
        }
        continue;
      }

      token += ch;
    }

    flush();
  }

  return tokens;
}

/// Translates token strings into vocabulary IDs.
///
/// @param tokens  token texts, e.g. the output of lexer().
/// @param vocab   vocabulary to look the tokens up in; it is not modified.
/// @return one ID per input token, in the same order; tokens missing from
///         @p vocab are encoded as 0.
std::vector<VALUE_ID> encoder(std::vector<std::string> tokens, Vocabulary &vocab) {
  std::vector<VALUE_ID> encoded_tokens;
  encoded_tokens.reserve(tokens.size());

  for (const auto &token : tokens) {
    const auto it = vocab.value_map.find(token);
    encoded_tokens.push_back(it != vocab.value_map.end() ? it->second : 0);
  }
  return encoded_tokens;
}

/// Builds a vocabulary from a token stream.
///
/// Each distinct non-empty token receives the next free ID, starting at 1 in
/// order of first appearance, and a lexical category (see classify_token()).
/// Empty tokens are skipped and repeated tokens keep their first ID.
///
/// @param tokens  token texts, e.g. the output of lexer().
/// @return the populated vocabulary; next_id is one past the last ID used.
Vocabulary tokenizer(std::vector<std::string> tokens) {
  Vocabulary vocab;
  vocab.next_id = 1; // Start IDs from 1, reserve 0 for unknown/special tokens

  for (const auto &token : tokens) {
    if (token.empty())
      continue;

    const auto inserted = vocab.value_map.emplace(token, vocab.next_id);
    if (!inserted.second)
      continue; // already registered

    ++vocab.next_id;
    vocab.type_map[token] = classify_token(token);
  }
  return vocab;
}

#endif // TOKEN_CPP