diff --git a/compile_commands.json b/compile_commands.json
new file mode 120000
index 0000000..3396cd2
--- /dev/null
+++ b/compile_commands.json
@@ -0,0 +1 @@
+/Users/AfonsoCMSosua/Developer/C++/CTokenizerAnalizer/build/compile_commands.json
\ No newline at end of file
diff --git a/include/io.cpp b/include/io.cpp
new file mode 100644
index 0000000..b438198
--- /dev/null
+++ b/include/io.cpp
@@ -0,0 +1,55 @@
+/*
+ *  io.cpp
+ *
+ *  - This file contains basic functions to read files and parse them
+ *
+ *  - Taken from AfonsoCMSousa (me) advent of code repository:
+ *  https://github.com/AfonsoCMSousa/AdventOfCode
+ *
+ *  Note: The functions were changed to better suit the needs of this project, but the original code can be found in the repository above.
+ *  Created by AfonsoCMSousa on 21/05/2026.
+ */
+
+#ifndef FILE_IO_CPP
+#define FILE_IO_CPP
+
+#include <cstdint>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Reads a text file and returns its lines in order, each stored as a std::vector<char>.
+// std::getline drops the '\n' terminator (a '\r' from CRLF input is kept); an empty file yields {}.
+// Throws std::runtime_error("Could not open file") if filePath is null or the file cannot be opened.
+std::vector<std::vector<char>> read_char(const char *filePath)
+{
+    std::ifstream file;
+    if (filePath != nullptr)
+        file.open(filePath); // guarded: opening a null path is undefined behaviour
+    if (!file.is_open())
+    {
+        throw std::runtime_error("Could not open file");
+    }
+
+    std::vector<std::vector<char>> lines;
+    for (std::string line; std::getline(file, line);)
+    {
+        // Build each row in place from the characters of the line; no temporary vector is copied.
+        lines.emplace_back(line.begin(), line.end());
+    }
+    return lines; // `file` closes itself in its destructor
+}
+
+// Splits `number` into its base-10 digits, most significant first (e.g. 120 -> {1, 2, 0}; 0 -> {0}).
+std::vector<uint64_t> get_digits(uint64_t number) {
+    std::vector<uint64_t> low_to_high;
+    do { // do-while: 0 must still produce its single digit instead of an empty vector
+        low_to_high.push_back(number % 10);
+        number /= 10;
+    } while (number != 0);
+    // Reverse through iterators so the block does not rely on <algorithm>, which this file never includes.
+    return std::vector<uint64_t>(low_to_high.rbegin(), low_to_high.rend());
+}
+
+#endif // FILE_IO_CPP
diff --git a/include/token.cpp b/include/token.cpp
new file mode 100644
index 0000000..5ba81c8
--- /dev/null
+++ b/include/token.cpp
@@ -0,0 +1,234 @@
+#ifndef TOKEN_CPP
+#define TOKEN_CPP
+
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+typedef uint64_t VALUE_ID; // numeric ID of a distinct token; standard <cstdint> type instead of the non-standard u_int64_t
+typedef void *DATA;        // opaque pointer; in this file only named by the commented-out Token::data field
+// Lexical category recorded for each vocabulary entry (tokenizer() fills Vocabulary::type_map with these).
+enum class TYPE { KEYWORD, IDENTIFIER, NUMBER, STRING, CHAR, OPERATOR, PUNCTUATION, PREPROCESSOR, HEADER, COMMENT, END_OF_FILE, UNKNOWN };
+
+// One classified token. NOTE(review): nothing in the visible sources instantiates Token yet.
+struct Token {
+	TYPE type;      // lexical category of the token
+	VALUE_ID value; // presumably the ID assigned in Vocabulary::value_map - confirm once Token is used
+	// INFO: The fields below are only used for debugging; the analyzer won't take them into account.
+	// WARN: Removed: "This turns the tokenizer into a C-style unsafe container, not a dataset generator."
+	// u_int64_t data_len;
+	// DATA data;
+};
+
+// Returns the printable name of a TYPE, as used by the vocabulary dump in main().
+// Several names end in '\t' (STRING, CHAR, HEADER, COMMENT, UNKNOWN) - presumably to line up the
+// columns of that dump; the literals are program output and are kept byte-for-byte.
+// Any value without a table entry, including TYPE::UNKNOWN, yields "UNKNOWN\t".
+std::string type_to_string(TYPE type) {
+	struct Entry {
+		TYPE type;
+		const char *name;
+	};
+	static const Entry table[] = {
+	    {TYPE::KEYWORD, "KEYWORD"},
+	    {TYPE::IDENTIFIER, "IDENTIFIER"},
+	    {TYPE::NUMBER, "NUMBER"},
+	    {TYPE::STRING, "STRING\t"},
+	    {TYPE::CHAR, "CHAR\t"},
+	    {TYPE::OPERATOR, "OPERATOR"},
+	    {TYPE::PUNCTUATION, "PUNCTUATION"},
+	    {TYPE::PREPROCESSOR, "PREPROCESSOR"},
+	    {TYPE::HEADER, "HEADER\t"},
+	    {TYPE::COMMENT, "COMMENT\t"},
+	    {TYPE::END_OF_FILE, "END_OF_FILE"},
+	};
+	for (const Entry &entry : table) {
+		if (entry.type == type)
+			return entry.name;
+	}
+	return "UNKNOWN\t"; // TYPE::UNKNOWN and any unlisted value
+}
+
+struct Vocabulary { // lookup tables built by tokenizer()
+	std::unordered_map<std::string, VALUE_ID> value_map; // token text -> unique ID
+	std::unordered_map<std::string, TYPE> type_map;      // token text -> lexical category
+	VALUE_ID next_id;                                    // next ID to hand out; tokenizer() starts it at 1
+};
+
+// Splits source lines into raw token strings; classification is left to tokenizer().
+//  - Outside literals, spaces, tabs and carriage returns separate tokens.
+//  - String/char literals stay whole, quotes included. The opening delimiter is remembered, so an
+//    apostrophe inside "..." does not end the literal, and a backslash escapes the next character.
+//  - Literal state is reset on every line: a C literal cannot span lines, and a stray quote (such
+//    as an apostrophe in a comment) would otherwise garble the rest of the file.
+//  - # ( ) { } [ ] , ; = ! become one-character tokens, except that "==" and "!=" stay together.
+//  - On preprocessor lines (first non-blank character is '#') '<' and '>' only delimit, so
+//    `#include <stdio.h>` yields "#", "include", "stdio.h". Elsewhere they form the operators
+//    "<", ">", "<<", ">>", "<=", ">=", "<<=" and ">>=".
+//  - Any other character (e.g. + - * / .) joins the current token; comments are not recognised.
+// file_content is taken by const reference so the file is not deep-copied on every call.
+std::vector<std::string> lexer(const std::vector<std::vector<char>> &file_content) {
+	static const std::string single_char_tokens = "#(){}[],;=!";
+	std::vector<std::string> tokens;
+	std::string token; // token currently being accumulated
+
+	// Pushes the pending token, if there is one.
+	const auto flush = [&tokens, &token]() {
+		if (!token.empty()) {
+			tokens.push_back(token);
+			token.clear();
+		}
+	};
+	// Ends the pending token, then emits `text` as a token of its own.
+	const auto emit = [&tokens, &flush](const std::string &text) {
+		flush();
+		tokens.push_back(text);
+	};
+
+	for (const auto &line : file_content) {
+		bool is_preprocessor = false;
+		for (const char ch : line) {
+			if (ch == ' ' || ch == '\t')
+				continue;
+			is_preprocessor = (ch == '#');
+			break;
+		}
+
+		char quote = '\0';    // delimiter of the literal being read; '\0' outside literals
+		bool escaped = false; // previous literal character was an unescaped backslash
+		for (size_t i = 0; i < line.size(); ++i) {
+			const char ch = line[i];
+			const char next_ch = (i + 1 < line.size()) ? line[i + 1] : '\0';
+
+			if (ch == '\n' || ch == '\0') { // always a separator, even inside a literal
+				flush();
+				continue;
+			}
+			if (quote != '\0') { // inside a literal: copy verbatim up to the matching delimiter
+				token += ch;
+				if (escaped)
+					escaped = false;
+				else if (ch == '\\')
+					escaped = true;
+				else if (ch == quote)
+					quote = '\0';
+				continue;
+			}
+			if (ch == '"' || ch == '\'') { // a literal starts; it stays attached to a prefix such as L
+				quote = ch;
+				token += ch;
+				continue;
+			}
+			if (ch == ' ' || ch == '\t' || ch == '\r') {
+				flush();
+				continue;
+			}
+
+			if (ch == '<' || ch == '>') {
+				if (is_preprocessor) {
+					flush(); // delimiter only: the bracket itself is not a token
+				} else if (next_ch == ch && i + 2 < line.size() && line[i + 2] == '=') {
+					emit(std::string{ch, ch, '='}); // "<<=" or ">>="
+					i += 2;
+				} else if (next_ch == ch || next_ch == '=') {
+					emit(std::string{ch, next_ch}); // "<<", ">>", "<=" or ">="
+					++i;
+				} else {
+					emit(std::string(1, ch));
+				}
+				continue;
+			}
+
+			if ((ch == '=' || ch == '!') && next_ch == '=') {
+				emit(std::string{ch, '='}); // "==" or "!="
+				++i;
+			} else if (single_char_tokens.find(ch) != std::string::npos) {
+				emit(std::string(1, ch));
+			} else {
+				token += ch;
+			}
+		}
+		flush(); // a token never continues onto the next line
+	}
+	return tokens;
+}
+
+// Maps every token to its vocabulary ID, preserving order; tokens missing from `vocab` become 0.
+// (tokenizer() starts real IDs at 1, so 0 never collides with a known token.)
+// `tokens` is taken by const reference to avoid deep-copying the whole token list per call.
+std::vector<VALUE_ID> encoder(const std::vector<std::string> &tokens, Vocabulary &vocab) {
+	const VALUE_ID unknown_id = 0;
+	std::vector<VALUE_ID> encoded_tokens;
+	encoded_tokens.reserve(tokens.size()); // exactly one output ID per input token
+
+	for (const auto &token : tokens) {
+		const auto it = vocab.value_map.find(token);
+		encoded_tokens.push_back(it != vocab.value_map.end() ? it->second : unknown_id);
+	}
+	return encoded_tokens;
+}
+
+// Builds the vocabulary of a token stream: every distinct non-empty token gets the next free ID
+// (IDs start at 1; 0 stays reserved for unknown tokens) and a TYPE derived from its text alone.
+// First match wins, in this order: COMMENT, HEADER, KEYWORD, PREPROCESSOR, STRING, CHAR, NUMBER,
+// OPERATOR, PUNCTUATION, IDENTIFIER, UNKNOWN.
+//  - KEYWORD is tested before PREPROCESSOR because "if" and "else" are in both word lists and
+//    would otherwise never be labelled as keywords.
+//  - HEADER requires the name to end in ".h" (optionally followed by a closing quote) instead of
+//    merely containing ".h", so tokens such as obj.height are no longer labelled as headers.
+// tokens is taken by const reference to avoid copying the whole token list.
+Vocabulary tokenizer(const std::vector<std::string> &tokens) {
+	using namespace std;
+
+	// 1. Complete sets for C keywords, operators, and punctuation (static: built once, not per call)
+	static const unordered_set<string> keywords = {"auto",   "break",  "case",   "char",   "const",   "continue", "default",  "do",       "double",   "else",   "enum",  "extern",
+	                                               "float",  "for",    "goto",   "if",     "inline",  "int",      "long",     "register", "restrict", "return", "short", "signed",
+	                                               "sizeof", "static", "struct", "switch", "typedef", "union",    "unsigned", "void",     "volatile", "while"};
+
+	static const unordered_set<string> preprocessor_words = {"include", "define", "undef", "ifdef", "ifndef", "if", "elif", "else", "endif", "line", "error", "pragma"};
+
+	static const unordered_set<string> operators = {"+", "-", "*",  "/",  "%", "++", "--", "==", "!=", "<",  ">",  "<=", ">=", "&&",  "||",  "!",  "&", "|",
+	                                                "^", "~", "<<", ">>", "=", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "->", "."};
+
+	static const unordered_set<string> punctuation = {"(", ")", "{", "}", "[", "]", ",", ";", ":", "?"};
+
+	// <cctype> functions are undefined for negative char values, hence the unsigned char casts.
+	const auto is_digit = [](char c) { return isdigit(static_cast<unsigned char>(c)) != 0; };
+	const auto is_alpha = [](char c) { return isalpha(static_cast<unsigned char>(c)) != 0; };
+
+	// 2. Strict hierarchical classification of one non-empty token
+	const auto classify = [&](const string &t) -> TYPE {
+		const size_t name_end = t.size() - (t.back() == '"' ? 1 : 0); // ignore a closing quote
+		if (t.size() >= 2 && t[0] == '/' && (t[1] == '/' || t[1] == '*')) return TYPE::COMMENT;
+		// HEADER is tested before STRING, so a quoted include name such as "foo.h" is a header.
+		if (name_end > 2 && t.compare(name_end - 2, 2, ".h") == 0) return TYPE::HEADER;
+		if (keywords.count(t) > 0) return TYPE::KEYWORD;
+		if (t == "#" || preprocessor_words.count(t) > 0) return TYPE::PREPROCESSOR;
+		if (t[0] == '"') return TYPE::STRING;
+		if (t[0] == '\'') return TYPE::CHAR;
+		if (is_digit(t[0]) || (t[0] == '.' && t.size() > 1 && is_digit(t[1]))) return TYPE::NUMBER;
+		if (operators.count(t) > 0) return TYPE::OPERATOR;
+		if (punctuation.count(t) > 0) return TYPE::PUNCTUATION;
+		if (is_alpha(t[0]) || t[0] == '_') return TYPE::IDENTIFIER;
+		return TYPE::UNKNOWN;
+	};
+
+	Vocabulary vocab;
+	vocab.next_id = 1; // Start IDs from 1, reserve 0 for unknown/special tokens
+
+	for (const auto &token : tokens) {
+		// emplace() looks up and inserts in one step; .second is false for already-known tokens.
+		if (token.empty() || !vocab.value_map.emplace(token, vocab.next_id).second)
+			continue;
+		++vocab.next_id;
+		vocab.type_map[token] = classify(token);
+	}
+
+	return vocab;
+}
+#endif // TOKEN_CPP
diff --git a/source/main.cpp b/source/main.cpp
index df28b27..f820ee0 100644
--- a/source/main.cpp
+++ b/source/main.cpp
@@ -1,7 +1,91 @@
 #include <iostream>
+#include <vector>
 
-int main()
-{
-    std::cout << "Hello, World!" << std::endl;
-    return 0;
-}
\ No newline at end of file
+// File I/O functions
+#include "io.cpp"
+
+// Tokenizer functions
+#include "token.cpp"
+
+using namespace std;
+
+const char *VERSION = "1.0.0"; // printed by --version / -v
+// Command-line state written by process_args(); globals are zero-initialized, so FILE_PATH starts empty.
+char showTokens = 0;  // set to 1 by --show-tokens; main() then prints the vocabulary
+char FILE_PATH[1024]; // input path given with --input / -i
+
+// Parses the command line into the FILE_PATH / showTokens globals; --help/-h and --version/-v print and exit(0).
+// A missing or over-long --input value, or no input file at all, is reported on stderr followed by exit(1).
+static inline void process_args(int argc, char **args) {
+	for (int i = 1; i < argc; i++) { // start at 1: args[0] is the program name, not an option
+		const std::string arg = args[i];
+		if (arg == "--help" || arg == "-h") {
+			std::cout << "Usage: " << args[0] << " [options]\n";
+			std::cout << "Options:\n";
+			std::cout << "  --help  -h\tShow this help message\n";
+			std::cout << "  --version  -v\tShow version information\n";
+			std::cout << "  --input  -i\tSpecify input file path\n";
+			std::cout << "  --show-tokens\tPrint the vocabulary mapping\n"; // was accepted but undocumented
+			exit(0);
+		} else if (arg == "--version" || arg == "-v") {
+			std::cout << "Version " << VERSION << "\n";
+			exit(0);
+		} else if (arg == "--input" || arg == "-i") {
+			if (i + 1 >= argc) {
+				std::cerr << "Error: --input option requires a file path\n";
+				exit(1);
+			}
+			const std::string path = args[++i]; // consume the value so it is never parsed as an option
+			if (path.size() >= sizeof(FILE_PATH)) {
+				std::cerr << "Error: input file path is too long\n"; // previously truncated silently
+				exit(1);
+			}
+			FILE_PATH[path.copy(FILE_PATH, sizeof(FILE_PATH) - 1)] = '\0'; // string::copy adds no terminator
+		} else if (arg == "--show-tokens") {
+			showTokens = 1;
+		}
+	}
+
+	if (FILE_PATH[0] == '\0') {
+		std::cerr << "Error: No input file specified. Use --input <file_path> to specify an input file.\n";
+		exit(1);
+	}
+}
+
+// Program entry: reads the input file, lexes it and builds the vocabulary; with --show-tokens the
+// vocabulary is printed. Returns 1 if the file cannot be read, 0 otherwise.
+int main(int argc, char **args) {
+	process_args(argc, args);
+	vector<vector<char>> _file_content;
+	try {
+		_file_content = read_char(FILE_PATH);
+	} catch (const std::exception &e) {
+		std::cerr << "Error: [" << e.what() << "]\n";
+		return 1;
+	}
+	cout << "File read successfully. Total lines: " << _file_content.size() << "\n";
+	cout << "First line (as chars): ";
+	if (!_file_content.empty()) {
+		// iostream only: printf was used here without <cstdio> being included
+		cout.write(_file_content[0].data(), static_cast<std::streamsize>(_file_content[0].size()));
+	}
+	cout << "\n"; // terminate the line even when the file is empty
+
+	// NOTE: Tokenizer here
+	vector<string> tokens = lexer(_file_content);
+	Vocabulary vocab = tokenizer(tokens);
+
+	// NOTE: Pretty print the vocab mapping
+	if (showTokens) {
+		cout << "Vocabulary Mapping:\n";
+		for (const auto &pair : vocab.value_map) {
+			// Zero-pad by hand: printf("%03llu") is undefined where VALUE_ID is `unsigned long`.
+			const string digits = to_string(pair.second);
+			const string id = string(digits.size() < 3 ? 3 - digits.size() : 0, '0') + digits;
+			cout << "Token: ID: [" << id << "] -> " << type_to_string(vocab.type_map.at(pair.first)) << "\tToken: [" << pair.first << "]" << "\n";
+		}
+	}
+	// NOTE: Encode tokens to IDs
+	vector<VALUE_ID> encoded_tokens = encoder(tokens, vocab);
+	return 0;
+}
diff --git a/test b/test
new file mode 100755
index 0000000..b6171b7
Binary files /dev/null and b/test differ
diff --git a/test.c b/test.c
new file mode 100644
index 0000000..f6686a7
--- /dev/null
+++ b/test.c
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int main() { /* sample entry point: prints a greeting */
+    printf("Hello, World!\n");
+    return 0; /* reports success */
+}
+
+void simple_test() { /* prints one fixed line; not called anywhere in this file */
+    printf("This is a simple test function.\n");
+}
+
+int my_int;     /* file-scope sample declarations, one per basic type; */
+char my_char;   /* none of them is read or written in this file        */
+float my_float;
+
+int add(int a, int b) { /* returns the sum of a and b */
+    return a + b;
+}