// CTokenizerAnalizer/include/token.cpp

#ifndef TOKEN_CPP
#define TOKEN_CPP

#include <cctype>
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// Numeric ID given to a token string: a fixed-width 64-bit unsigned integer.
// The standard <cstdint> type is used because `u_int64_t` is a non-standard
// BSD/glibc alias that is not available on every toolchain.
using VALUE_ID = std::uint64_t;
// Untyped pointer to an arbitrary payload.
using DATA = void *;

// Lexical category recorded for a token.
// The enumerator order is part of the contract: underlying values run from
// 0 (KEYWORD) to 11 (UNKNOWN).
enum class TYPE {
	KEYWORD,
	IDENTIFIER,
	NUMBER,
	STRING,
	CHAR,
	OPERATOR,
	PUNCTUATION,
	PREPROCESSOR,
	HEADER,
	COMMENT,
	END_OF_FILE,
	UNKNOWN
};

// A single classified token: its lexical category plus a numeric ID.
struct Token {
	TYPE type;      // lexical category of the token
	VALUE_ID value; // numeric ID of the token (same ID type the Vocabulary hands out)

	// NOTE: a raw debug-only payload used to live here and was removed on purpose.
	// The analyzer never looked at it, and keeping it was judged to turn the
	// tokenizer "into a C-style unsafe container, not a dataset generator".
	// Former fields, kept for reference only:
	//   u_int64_t data_len;
	//   DATA data;
};

// Returns the printable name of a token category.
//
// Some names carry a trailing tab ("STRING\t", "CHAR\t", "HEADER\t",
// "COMMENT\t", "UNKNOWN\t"); this looks intended to keep columns aligned in
// debug dumps, so the strings are returned exactly as they always were.
//
// Declared `inline` because this file is include-guarded and meant to be
// #included: a non-inline definition would produce multiple-definition link
// errors as soon as two translation units include it.
//
// @param type Category to name.
// @return The category name; values outside the enumeration map to "UNKNOWN\t".
inline std::string type_to_string(TYPE type) {
	// No `default:` label on purpose, so -Wswitch reports enumerators that are
	// added to TYPE later but forgotten here.
	switch (type) {
	case TYPE::KEYWORD:
		return "KEYWORD";
	case TYPE::IDENTIFIER:
		return "IDENTIFIER";
	case TYPE::NUMBER:
		return "NUMBER";
	case TYPE::STRING:
		return "STRING\t";
	case TYPE::CHAR:
		return "CHAR\t";
	case TYPE::OPERATOR:
		return "OPERATOR";
	case TYPE::PUNCTUATION:
		return "PUNCTUATION";
	case TYPE::PREPROCESSOR:
		return "PREPROCESSOR";
	case TYPE::HEADER:
		return "HEADER\t";
	case TYPE::COMMENT:
		return "COMMENT\t";
	case TYPE::END_OF_FILE:
		return "END_OF_FILE";
	case TYPE::UNKNOWN:
		break;
	}
	// Reached for TYPE::UNKNOWN and for any out-of-range value.
	return "UNKNOWN\t";
}

// Mapping from token text to its numeric ID and lexical category.
struct Vocabulary {
	// Token text -> numeric ID.
	std::unordered_map<std::string, VALUE_ID> value_map;
	// Token text -> lexical category.
	std::unordered_map<std::string, TYPE> type_map;
	// Next ID to hand out. Defaults to 1 so that a default-constructed
	// Vocabulary never holds an indeterminate counter and never hands out 0,
	// which the encoder uses as the "unknown token" ID.
	VALUE_ID next_id = 1;
};

// Splits raw source lines into a flat, ordered list of token strings.
//
// Splitting rules:
//  - Blanks (space, tab) separate tokens; '\n', '\r' and '\0' always end the
//    current token.
//  - The symbols  #  (  )  ,  ;  =  !  ==  !=  <  >  <=  >=  <<  >>  <<=  >>=
//    are emitted as tokens of their own.
//  - On a line whose first non-blank character is '#', '<' and '>' only
//    separate tokens and are dropped, so `#include <stdio.h>` yields
//    "#", "include", "stdio.h".
//  - String and character literals stay inside one token, quotes included.
//    A literal is closed only by the same quote character that opened it, and
//    a backslash escapes the character that follows it.
//  - Any other character is appended to the current token unchanged.
//
// Literal state is reset at the start of every line unless the previous line
// ended with a backslash inside a literal. This keeps a stray quote (for
// example an apostrophe in a comment) from swallowing the rest of the file.
//
// @param file_content One entry per source line; a trailing '\n' and/or '\0'
//                     is accepted but not required.
// @return Tokens in order of appearance.
inline std::vector<std::string> lexer(const std::vector<std::vector<char>> &file_content) {
	std::vector<std::string> tokens;
	std::string token;

	// Emits the pending token, if there is one.
	const auto flush = [&tokens, &token]() {
		if (!token.empty()) {
			tokens.push_back(token);
			token.clear();
		}
	};

	char open_quote = '\0';         // quote that opened the current literal, '\0' outside literals
	bool literal_continues = false; // previous line ended with '\' inside a literal

	for (const auto &line : file_content) {
		if (!literal_continues)
			open_quote = '\0';
		literal_continues = false;

		// A line is a preprocessor line when its first non-blank character is '#'.
		bool is_preprocessor = false;
		for (const char ch : line) {
			if (ch == ' ' || ch == '\t')
				continue;
			is_preprocessor = (ch == '#');
			break;
		}

		for (size_t i = 0; i < line.size(); ++i) {
			const char ch = line[i];
			const char next_ch = (i + 1 < line.size()) ? line[i + 1] : '\0';

			// Line terminators end the current token even inside a literal.
			if (ch == '\n' || ch == '\r' || ch == '\0') {
				flush();
				continue;
			}

			if (open_quote != '\0') {
				token += ch;
				if (ch == '\\') {
					if (next_ch == '\n' || next_ch == '\r' || next_ch == '\0') {
						// Backslash at end of line: the literal goes on in the next line.
						literal_continues = true;
					} else {
						// Escaped character: copy it verbatim so it cannot close the literal.
						token += next_ch;
						++i;
					}
				} else if (ch == open_quote) {
					open_quote = '\0';
				}
				continue;
			}

			if (ch == ' ' || ch == '\t') {
				flush();
				continue;
			}

			if (ch == '"' || ch == '\'') {
				open_quote = ch;
				token += ch;
				continue;
			}

			if (ch == '<' || ch == '>') {
				flush();
				if (is_preprocessor)
					continue; // pure delimiter on preprocessor lines

				std::string op(1, ch);
				if (next_ch == ch) {
					op += next_ch; // << or >>
					if (i + 2 < line.size() && line[i + 2] == '=')
						op += '='; // <<= or >>=
				} else if (next_ch == '=') {
					op += '='; // <= or >=
				}
				i += op.size() - 1; // consume the extra characters of the operator
				tokens.push_back(op);
				continue;
			}

			if (ch == '#' || ch == '(' || ch == ')' || ch == ',' || ch == ';' || ch == '=' || ch == '!') {
				flush();
				std::string symbol(1, ch);
				if ((ch == '=' || ch == '!') && next_ch == '=') {
					symbol += '='; // == or !=
					++i;
				}
				tokens.push_back(symbol);
				continue;
			}

			token += ch;
		}
		flush();
	}

	return tokens;
}

std::vector<VALUE_ID> encoder(std::vector<std::string> tokens, Vocabulary &vocab) {
	std::vector<VALUE_ID> encoded_tokens;
	for (const auto &token : tokens) {
		auto it = vocab.value_map.find(token);
		if (it != vocab.value_map.end()) {
			encoded_tokens.push_back(it->second);
		} else {
			// If the token is not in the vocabulary, we can choose to add it or skip it.
			// For this implementation, we'll skip it and assign a special ID (e.g., 0) for unknown tokens.
			encoded_tokens.push_back(0); // 0 for unknown tokens
		}
	}
	return encoded_tokens;
}

// Builds a Vocabulary from a token stream.
//
// Every distinct, non-empty token gets the next free ID (starting at 1, in
// order of first appearance; 0 stays reserved for unknown tokens) and exactly
// one category. The first matching rule wins:
//   1. COMMENT      - starts with "//" or "/*"
//   2. HEADER       - optionally quoted name ending in .h, .hh, .hpp or .hxx
//   3. KEYWORD      - C keyword
//   4. PREPROCESSOR - "#" or a directive name (include, define, ...)
//   5. STRING       - starts with '"'
//   6. CHAR         - starts with '\''
//   7. NUMBER       - starts with a digit, or with '.' followed by a digit
//   8. OPERATOR     - listed operator
//   9. PUNCTUATION  - listed punctuation symbol
//  10. IDENTIFIER   - starts with a letter or '_'
//  11. UNKNOWN      - anything else
//
// Keywords are tested before directive names because "if" and "else" appear
// in both lists; a token has a single category per spelling, and the keyword
// reading is the one that applies to ordinary code.
//
// Declared `inline` because this include-guarded file is meant to be
// #included from several translation units.
//
// @param tokens Token strings, e.g. the output of lexer().
// @return The populated vocabulary.
inline Vocabulary tokenizer(const std::vector<std::string> &tokens) {
	// The lookup tables never change, so they are built once, not on every call.
	static const std::unordered_set<std::string> keywords = {
	    "auto",   "break",  "case",   "char",   "const",   "continue", "default",  "do",       "double",   "else",   "enum",  "extern",
	    "float",  "for",    "goto",   "if",     "inline",  "int",      "long",     "register", "restrict", "return", "short", "signed",
	    "sizeof", "static", "struct", "switch", "typedef", "union",    "unsigned", "void",     "volatile", "while"};

	static const std::unordered_set<std::string> preprocessor_words = {"include", "define", "undef", "ifdef", "ifndef", "if",
	                                                                   "elif",    "else",   "endif", "line",  "error",  "pragma"};

	static const std::unordered_set<std::string> operators = {"+", "-", "*",  "/",  "%", "++", "--", "==", "!=", "<",  ">",  "<=", ">=", "&&",  "||",  "!",  "&", "|",
	                                                          "^", "~", "<<", ">>", "=", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "->", "."};

	static const std::unordered_set<std::string> punctuation = {"(", ")", "{", "}", "[", "]", ",", ";", ":", "?"};

	static const char *const header_suffixes[] = {".h", ".hh", ".hpp", ".hxx"};

	// <cctype> predicates are undefined for negative char values, so every
	// character goes through unsigned char first.
	const auto is_digit = [](char c) { return std::isdigit(static_cast<unsigned char>(c)) != 0; };
	const auto is_identifier_start = [](char c) { return c == '_' || std::isalpha(static_cast<unsigned char>(c)) != 0; };

	// True when `name` (surrounding double quotes ignored) ends in a header
	// extension. Matching the suffix instead of any ".h" substring keeps
	// tokens such as "cfg.height" from being mistaken for header names.
	const auto is_header_name = [](std::string name) {
		if (name.size() >= 2 && name.front() == '"' && name.back() == '"')
			name = name.substr(1, name.size() - 2);
		for (const char *raw_suffix : header_suffixes) {
			const std::string suffix(raw_suffix);
			if (name.size() > suffix.size() && name.compare(name.size() - suffix.size(), suffix.size(), suffix) == 0)
				return true;
		}
		return false;
	};

	// Maps a non-empty token to its category; the rule order is significant.
	const auto classify = [&](const std::string &token) -> TYPE {
		const char first = token[0];
		if (first == '/' && token.size() >= 2 && (token[1] == '/' || token[1] == '*'))
			return TYPE::COMMENT;
		if (is_header_name(token))
			return TYPE::HEADER;
		if (keywords.count(token) > 0)
			return TYPE::KEYWORD;
		if (token == "#" || preprocessor_words.count(token) > 0)
			return TYPE::PREPROCESSOR;
		if (first == '"')
			return TYPE::STRING;
		if (first == '\'')
			return TYPE::CHAR;
		if (is_digit(first) || (first == '.' && token.size() > 1 && is_digit(token[1])))
			return TYPE::NUMBER;
		if (operators.count(token) > 0)
			return TYPE::OPERATOR;
		if (punctuation.count(token) > 0)
			return TYPE::PUNCTUATION;
		if (is_identifier_start(first))
			return TYPE::IDENTIFIER;
		return TYPE::UNKNOWN;
	};

	Vocabulary vocab;
	vocab.next_id = 1; // 0 is reserved for unknown/special tokens

	for (const auto &token : tokens) {
		if (token.empty())
			continue;

		// emplace() leaves an existing entry untouched, so a single lookup both
		// detects repeats and registers new tokens.
		const auto inserted = vocab.value_map.emplace(token, vocab.next_id);
		if (!inserted.second)
			continue;

		++vocab.next_id;
		vocab.type_map[token] = classify(token);
	}

	return vocab;
}
#endif // TOKEN_CPP