feat: tokenizer, encoder and parser

2026-05-21 17:31:15 +01:00 · 2026-05-21 17:31:15 +01:00 · 9f0b253452
commit 9f0b253452
parent 5496e9386d
6 changed files with 398 additions and 5 deletions
--- a/compile_commands.json
+++ b/compile_commands.json
@ -0,0 +1 @@
 /Users/AfonsoCMSosua/Developer/C++/CTokenizerAnalizer/build/compile_commands.json
--- a/include/io.cpp
+++ b/include/io.cpp
@ -0,0 +1,55 @@
 /*
 *  io.cpp
 *
 *  - This file contains basic functions to read files and parse them
 *
 *  - Taken from AfonsoCMSousa's (my) Advent of Code repository:
 *  https://github.com/AfonsoCMSousa/AdventOfCode
 *
 *  Note: The functions were changed to better suit the needs of this project, but the original code can be found in the repository above.
 *  Created by AfonsoCMSousa on 21/05/2026.
 */
 #ifndef FILE_IO_CPP
 #define FILE_IO_CPP
 #include <algorithm>
 #include <cstdint>
 #include <fstream>
 #include <sstream>
 #include <stdexcept>
 #include <string>
 #include <vector>
 // Loads the file at `filePath` and returns its contents line by line: one
 // std::vector<char> per line, holding that line's characters without the
 // delimiter consumed by std::getline.
 // Throws std::runtime_error("Could not open file") when the file cannot be opened.
 std::vector<std::vector<char>> read_char(const char *filePath)
 {
    std::ifstream input(filePath);
    if (!input.is_open())
    {
        throw std::runtime_error("Could not open file");
    }

    std::vector<std::vector<char>> rows;
    for (std::string text; std::getline(input, text);)
    {
        rows.emplace_back(text.begin(), text.end());
    }
    // `input` is closed by its destructor when the function returns.
    return rows;
 }
 // Splits `number` into its base-10 digits, most significant digit first
 // (e.g. 1234 -> {1, 2, 3, 4}).
 // Zero has exactly one digit, so get_digits(0) returns {0}, not an empty vector.
 std::vector<uint64_t> get_digits(uint64_t number) {
 	std::vector<uint64_t> digits;
 	// do/while so the body runs once even when number == 0.
 	do {
 		digits.push_back(number % 10);
 		number /= 10;
 	} while (number != 0);
 	// Digits were collected least significant first; flip them into reading order.
 	std::reverse(digits.begin(), digits.end());
 	return digits;
 }
 #endif // FILE_IO_CPP
--- a/include/token.cpp
+++ b/include/token.cpp
@ -0,0 +1,234 @@
 #ifndef TOKEN_CPP
 #define TOKEN_CPP
 #include <cctype>
 #include <cstddef>
 #include <cstdint>
 #include <map>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 // Numeric ID assigned to each distinct token text in a Vocabulary.
 // Spelled with the standard std::uint64_t from <cstdint>; u_int64_t is a
 // non-standard BSD/glibc alias that is not available on every toolchain.
 typedef std::uint64_t VALUE_ID;
 // Untyped payload pointer; in this file it only appears in the disabled debug fields of Token.
 typedef void *DATA;
 // Lexical category recorded for each distinct token in the vocabulary.
 enum class TYPE {
 	KEYWORD,
 	IDENTIFIER,
 	NUMBER,
 	STRING,
 	CHAR,
 	OPERATOR,
 	PUNCTUATION,
 	PREPROCESSOR,
 	HEADER,
 	COMMENT,
 	END_OF_FILE,
 	UNKNOWN
 };
 // One classified token: its lexical category plus the vocabulary ID of its text.
 struct Token {
 	TYPE type;
 	VALUE_ID value;
 	// INFO: The two fields below were for debugging only; the analyzer would not take them into account.
 	// WARN: Removed: "This turns the tokenizer into a C-style unsafe container, not a dataset generator."
 	// u_int64_t data_len;
 	// DATA data;
 };
 // Returns the printable name of `type`.
 // Some names carry a trailing tab ("STRING\t", "CHAR\t", "HEADER\t",
 // "COMMENT\t", "UNKNOWN\t") -- presumably to line up columns in the
 // vocabulary listing. Any value without its own case maps to "UNKNOWN\t".
 std::string type_to_string(TYPE type) {
 	// Start from the fallback label and overwrite it for every named category.
 	const char *label = "UNKNOWN\t";
 	switch (type) {
 	case TYPE::KEYWORD:
 		label = "KEYWORD";
 		break;
 	case TYPE::IDENTIFIER:
 		label = "IDENTIFIER";
 		break;
 	case TYPE::NUMBER:
 		label = "NUMBER";
 		break;
 	case TYPE::STRING:
 		label = "STRING\t";
 		break;
 	case TYPE::CHAR:
 		label = "CHAR\t";
 		break;
 	case TYPE::OPERATOR:
 		label = "OPERATOR";
 		break;
 	case TYPE::PUNCTUATION:
 		label = "PUNCTUATION";
 		break;
 	case TYPE::PREPROCESSOR:
 		label = "PREPROCESSOR";
 		break;
 	case TYPE::HEADER:
 		label = "HEADER\t";
 		break;
 	case TYPE::COMMENT:
 		label = "COMMENT\t";
 		break;
 	case TYPE::END_OF_FILE:
 		label = "END_OF_FILE";
 		break;
 	default:
 		break;
 	}
 	return label;
 }
 // Token vocabulary built by tokenizer():
 //   value_map - token text -> numeric ID
 //   type_map  - token text -> lexical category
 //   next_id   - the ID that will be handed to the next new token
 // Declared as a named struct because a default member initializer is not
 // allowed in an unnamed struct that only has a typedef name.
 struct Vocabulary {
 	std::unordered_map<std::string, VALUE_ID> value_map;
 	std::unordered_map<std::string, TYPE> type_map;
 	// Defaults to 1 so a default-constructed Vocabulary never holds an
 	// indeterminate counter; ID 0 is what encoder() emits for unknown tokens.
 	VALUE_ID next_id = 1;
 };
 // Splits source text into raw token strings.
 //
 // Parameters:
 //   file_content - the file as a list of lines, each line a list of characters
 //                  (e.g. the result of read_char()).
 // Returns:
 //   The tokens of all lines, in source order. A token never spans two lines.
 //
 // Rules:
 //   - Outside a literal, spaces, tabs and carriage returns end the current token.
 //     '\n' and '\0' end it everywhere.
 //   - '"' or '\'' opens a literal that runs to the next unescaped occurrence of
 //     the SAME quote character; the quotes stay part of the token. A backslash
 //     keeps the following character inside the literal. The literal state is
 //     reset at the end of every line, so one stray quote (for example an
 //     apostrophe in a comment) cannot corrupt the lines after it.
 //   - On a line whose first non-blank character is '#', '<' and '>' only
 //     delimit tokens and are not emitted (so `#include <stdio.h>` yields
 //     "#", "include", "stdio.h").
 //   - On other lines '<' and '>' are operators; when followed by '<', '>' or '='
 //     the pair is emitted as one two-character token.
 //   - '#', '(', ')', ',', ';', '=' and '!' are emitted as one-character tokens,
 //     except "==" and "!=" which are emitted as two-character tokens.
 //   - Every other character is appended to the current token.
 std::vector<std::string> lexer(const std::vector<std::vector<char>> &file_content) {
 	std::vector<std::string> tokens;
 	std::string token;
 	// Emits the token accumulated so far (if any) and starts a fresh one.
 	const auto flush = [&tokens, &token]() {
 		if (!token.empty()) {
 			tokens.push_back(token);
 			token.clear();
 		}
 	};
 	for (const auto &line : file_content) {
 		// A line is a preprocessor line when its first non-blank character is '#'.
 		bool is_preprocessor = false;
 		for (const char ch : line) {
 			if (ch == ' ' || ch == '\t')
 				continue;
 			is_preprocessor = (ch == '#');
 			break;
 		}
 		// Quote character of the literal currently open, or '\0' when outside a literal.
 		char open_quote = '\0';
 		for (std::size_t i = 0; i < line.size(); ++i) {
 			const char ch = line[i];
 			const char next_ch = (i + 1 < line.size()) ? line[i + 1] : '\0';
 			if (ch == '\n' || ch == '\0') {
 				flush();
 				continue;
 			}
 			if (open_quote != '\0') {
 				token += ch;
 				if (ch == '\\' && i + 1 < line.size() && next_ch != '\n' && next_ch != '\0') {
 					// Escaped character: keep it in the literal so \" or \' does not close it.
 					token += next_ch;
 					++i;
 				} else if (ch == open_quote) {
 					open_quote = '\0';
 				}
 				continue;
 			}
 			if (ch == ' ' || ch == '\t' || ch == '\r') {
 				flush();
 				continue;
 			}
 			if (ch == '"' || ch == '\'') {
 				// The opening quote joins the current token (no flush), so a prefix stays attached.
 				open_quote = ch;
 				token += ch;
 				continue;
 			}
 			if (ch == '<' || ch == '>') {
 				flush();
 				if (is_preprocessor) {
 					// For preprocessor lines, '<' and '>' are delimiters only (not pushed).
 					continue;
 				}
 				if (next_ch == '<' || next_ch == '>' || next_ch == '=') {
 					tokens.push_back(std::string(1, ch) + next_ch);
 					++i; // Consume the second character of the operator.
 				} else {
 					tokens.push_back(std::string(1, ch));
 				}
 				continue;
 			}
 			if (ch == '#' || ch == '(' || ch == ')' || ch == ',' || ch == ';' || ch == '=' || ch == '!') {
 				flush();
 				if ((ch == '=' || ch == '!') && next_ch == '=') {
 					tokens.push_back(std::string(1, ch) + next_ch); // "==" or "!="
 					++i;
 				} else {
 					tokens.push_back(std::string(1, ch));
 				}
 				continue;
 			}
 			token += ch;
 		}
 		// End of line always terminates the pending token.
 		flush();
 	}
 	return tokens;
 }
 std::vector<VALUE_ID> encoder(std::vector<std::string> tokens, Vocabulary &vocab) {
 	std::vector<VALUE_ID> encoded_tokens;
 	for (const auto &token : tokens) {
 		auto it = vocab.value_map.find(token);
 		if (it != vocab.value_map.end()) {
 			encoded_tokens.push_back(it->second);
 		} else {
 			// If the token is not in the vocabulary, we can choose to add it or skip it.
 			// For this implementation, we'll skip it and assign a special ID (e.g., 0) for unknown tokens.
 			encoded_tokens.push_back(0); // 0 for unknown tokens
 		}
 	}
 	return encoded_tokens;
 }
 Vocabulary tokenizer(std::vector<std::string> tokens) {
 	using namespace std;
 	Vocabulary vocab;
 	vocab.next_id = 1; // Start IDs from 1, reserve 0 for unknown/special tokens
 	// 1. Define complete sets for C keywords, operators, and punctuation
 	const unordered_set<string> keywords = {"auto",   "break",  "case",   "char",   "const",   "continue", "default",  "do",       "double",   "else",   "enum",  "extern",
 	                                        "float",  "for",    "goto",   "if",     "inline",  "int",      "long",     "register", "restrict", "return", "short", "signed",
 	                                        "sizeof", "static", "struct", "switch", "typedef", "union",    "unsigned", "void",     "volatile", "while"};
 	const unordered_set<string> preprocessor_words = {"include", "define", "undef", "ifdef", "ifndef", "if", "elif", "else", "endif", "line", "error", "pragma"};
 	const unordered_set<string> operators = {"+", "-", "*",  "/",  "%", "++", "--", "==", "!=", "<",  ">",  "<=", ">=", "&&",  "||",  "!",  "&", "|",
 	                                         "^", "~", "<<", ">>", "=", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "->", "."};
 	const unordered_set<string> punctuation = {"(", ")", "{", "}", "[", "]", ",", ";", ":", "?"};
 	for (const auto &token : tokens) {
 		if (token.empty())
 			continue;
 		// If the token hasn't been registered yet, assign it a unique ID and a type
 		if (vocab.value_map.find(token) == vocab.value_map.end()) {
 			vocab.value_map[token] = vocab.next_id++;
 			// 2. Strict hierarchical classification logic
 			if (token[0] == '/' && token.size() >= 2 && (token[1] == '/' || token[1] == '*')) {
 				vocab.type_map[token] = TYPE::COMMENT;
 			}
 			// Check for HEADER files first before catching generic preprocessor tokens
 			else if (token.size() > 2 && token.find(".h") != string::npos) {
 				vocab.type_map[token] = TYPE::HEADER;
 			} else if (token == "#" || preprocessor_words.count(token) > 0) {
 				// Catches only '#' and 'include', 'define', etc.
 				vocab.type_map[token] = TYPE::PREPROCESSOR;
 			} else if (keywords.count(token) > 0) {
 				// Checked BEFORE identifiers so 'int' isn't labeled as an identifier
 				vocab.type_map[token] = TYPE::KEYWORD;
 			} else if (token[0] == '"') {
 				vocab.type_map[token] = TYPE::STRING;
 			} else if (token[0] == '\'') {
 				vocab.type_map[token] = TYPE::CHAR;
 			} else if (isdigit(token[0]) || (token[0] == '.' && token.size() > 1 && isdigit(token[1]))) {
 				vocab.type_map[token] = TYPE::NUMBER;
 			} else if (operators.count(token) > 0) {
 				vocab.type_map[token] = TYPE::OPERATOR;
 			} else if (punctuation.count(token) > 0) {
 				vocab.type_map[token] = TYPE::PUNCTUATION;
 			} else if (isalpha(token[0]) || token[0] == '_') {
 				vocab.type_map[token] = TYPE::IDENTIFIER;
 			} else {
 				vocab.type_map[token] = TYPE::UNKNOWN;
 			}
 		}
 	}
 	return vocab;
 }
 #endif // TOKEN_CPP
--- a/source/main.cpp
+++ b/source/main.cpp
@ -1,7 +1,91 @@
 #include <iostream>
 #include <vector>
-int main()
+// File I/O functions
-{
+#include "io.cpp"
-    std::cout << "Hello, World!" << std::endl;
+
-    return 0;
+// Tokenizer functions
 #include "token.cpp"
 // NOTE(review): file-scope using-directive; it applies to everything declared after it in this translation unit.
 using namespace std;
 // Version string printed by the --version / -v option.
 const char *VERSION = "1.0.0";
 // Set to 1 by --show-tokens; main() prints the vocabulary mapping when it is non-zero.
 char showTokens = 0;
 // Input file path filled in by --input / -i; process_args() treats a leading '\0' as "not specified".
 char FILE_PATH[1024];
 // Parses the command line and fills the FILE_PATH and showTokens globals.
 //
 // Parameters:
 //   argc - number of entries in `args`.
 //   args - argument strings; args[0] is the program name and is not parsed as an option.
 //
 // Recognised options:
 //   --help, -h      print usage and exit(0)
 //   --version, -v   print VERSION and exit(0)
 //   --input, -i     the next argument is copied into FILE_PATH
 //   --show-tokens   set showTokens to 1
 // Anything else is ignored.
 //
 // Exits with status 1 when --input has no value, when the path does not fit
 // into FILE_PATH, or when no input file was given at all.
 static inline void process_args(int argc, char **args) {
 	for (int i = 1; i < argc; i++) {
 		const std::string arg = args[i];
 		if (arg == "--help" || arg == "-h") {
 			std::cout << "Usage: " << args[0] << " [options]\n";
 			std::cout << "Options:\n";
 			std::cout << "  --help  -h\tShow this help message\n";
 			std::cout << "  --version  -v\tShow version information\n";
 			std::cout << "  --input  -i\tSpecify input file path\n";
 			std::cout << "  --show-tokens\tPrint the vocabulary mapping\n";
 			exit(0);
 		}
 		if (arg == "--version" || arg == "-v") {
 			std::cout << "Version " << VERSION << "\n";
 			exit(0);
 		}
 		if (arg == "--input" || arg == "-i") {
 			if (i + 1 >= argc) {
 				std::cerr << "Error: --input option requires a file path\n";
 				exit(1);
 			}
 			const std::string path = args[++i];
 			// Refuse over-long paths instead of silently truncating them to a different file name.
 			if (path.size() >= sizeof(FILE_PATH)) {
 				std::cerr << "Error: input file path is too long (maximum " << sizeof(FILE_PATH) - 1 << " characters)\n";
 				exit(1);
 			}
 			path.copy(FILE_PATH, path.size());
 			FILE_PATH[path.size()] = '\0';
 			// The value was consumed as a path; do not also interpret it as an option.
 			continue;
 		}
 		if (arg == "--show-tokens") {
 			showTokens = 1;
 		}
 	}
 	if (FILE_PATH[0] == '\0') {
 		std::cerr << "Error: No input file specified. Use --input <file_path> to specify an input file.\n";
 		exit(1);
 	}
 }
 int main(int argc, char **args) {
 	process_args(argc, args);
 	vector<vector<char>> _file_content;
 	try {
 		_file_content = read_char(FILE_PATH);
 	} catch (const std::exception &e) {
 		std::cerr << "Error: [" << e.what() << "]\n";
 		return 1;
 	}
 	cout << "File read successfully. Total lines: " << _file_content.size() << "\n";
 	cout << "First line (as chars): ";
 	if (!_file_content.empty()) {
 		for (const auto &ch : _file_content[0]) {
 			printf("%c", ch);
 		}
 		cout << "\n";
 	}
 	// NOTE: Tokenizer here
 	vector<string> tokens = lexer(_file_content);
 	Vocabulary vocab = tokenizer(tokens);
 	// NOTE: Pretty print the vocab mapping
 	if (showTokens) {
 		cout << "Vocabulary Mapping:\n";
 		for (const auto &pair : vocab.value_map) {
 			cout << "Token: ID: [";
 			printf("%03llu", pair.second);
 			cout << "] -> " << type_to_string(vocab.type_map.at(pair.first)) << "\tToken: [" << pair.first << "]" << "\n";
 		}
 	}
 	// NOTE: Encode tokens to IDs
 	vector<VALUE_ID> encoded_tokens = encoder(tokens, vocab);
 	return 0;
 }
--- a/BIN
+++ b/BIN
--- a/test.c
+++ b/test.c
@ -0,0 +1,19 @@
 #include <stdio.h>
 #include <stdlib.h>
 /* Entry point of the sample input: prints a greeting and returns 0. */
 int main() {
    printf("Hello, World!\n");
    return 0;
 }
 /* Prints one fixed message; takes no arguments and returns nothing. */
 void simple_test() {
    printf("This is a simple test function.\n");
 }
 /* File-scope variables of three different basic types. */
 int my_int;
 char my_char;
 float my_float;
 /* Returns the sum of a and b. */
 int add(int a, int b) {
    return a + b;
 }
		`@ -0,0 +1 @@`
							`/Users/AfonsoCMSosua/Developer/C++/CTokenizerAnalizer/build/compile_commands.json`