/*
 * io.cpp
 *
 * - Basic helper functions to read files and parse their contents.
 *
 * - Adapted from AfonsoCMSousa's Advent of Code repository:
 *   https://github.com/AfonsoCMSousa/AdventOfCode
 *
 * Note: The functions were changed to better suit the needs of this project;
 *       the original code can be found in the repository above.
 * Created by AfonsoCMSousa on 21/05/2026.
 */

#ifndef FILE_IO_CPP
#define FILE_IO_CPP

#include <algorithm>
#include <cstdint>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

/**
 * @brief Reads a text file and returns its lines as vectors of characters.
 *
 * Lines are split with std::getline, so the line terminator is not part of
 * the returned data.
 *
 * @param filePath Path of the file to read.
 * @return One std::vector<char> per line, in file order.
 * @throws std::runtime_error if filePath is null or the file cannot be opened.
 *
 * Declared inline because this file is pulled in with #include; a non-inline
 * definition would violate the one-definition rule as soon as a second
 * translation unit includes it.
 */
inline std::vector<std::vector<char>> read_char(const char *filePath)
{
    // Constructing an ifstream from a null pointer is undefined behaviour, so
    // report it the same way as any other path that cannot be opened.
    if (filePath == nullptr)
    {
        throw std::runtime_error("Could not open file");
    }

    std::ifstream file(filePath);
    if (!file.is_open())
    {
        throw std::runtime_error("Could not open file");
    }

    std::vector<std::vector<char>> lines;
    std::string line;
    while (std::getline(file, line))
    {
        lines.emplace_back(line.begin(), line.end());
    }

    // No explicit close(): the stream is closed when `file` goes out of scope.
    return lines;
}

/**
 * @brief Splits a number into its decimal digits, most significant first.
 *
 * @param number Value to decompose.
 * @return The digits of number; get_digits(0) yields {0} rather than an empty
 *         vector, so every input has at least one digit.
 *
 * NOTE(review): the element type of the returned vector was lost when this
 * source was captured; int is assumed here - confirm against the repository.
 */
inline std::vector<int> get_digits(std::uint64_t number)
{
    std::vector<int> digits;

    // do/while so that zero still produces its single digit.
    do
    {
        digits.push_back(static_cast<int>(number % 10));
        number /= 10;
    } while (number != 0);

    // Digits were collected least significant first.
    std::reverse(digits.begin(), digits.end());
    return digits;
}

#endif // FILE_IO_CPP
#ifndef TOKEN_CPP
#define TOKEN_CPP

#include <cctype>
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// Numeric ID given to each distinct token text. ID 0 is reserved for tokens
// that are not present in a vocabulary (see encoder()).
using VALUE_ID = std::uint64_t;
// Untyped payload pointer; only referenced by the disabled debug fields of Token.
using DATA = void *;

// Lexical category assigned to a token text by tokenizer().
enum class TYPE { KEYWORD, IDENTIFIER, NUMBER, STRING, CHAR, OPERATOR, PUNCTUATION, PREPROCESSOR, HEADER, COMMENT, END_OF_FILE, UNKNOWN };

// A classified token: its category plus the vocabulary ID of its text.
struct Token {
    TYPE type;
    VALUE_ID value;

    // INFO: This was only used for debugging; the analyzer won't take it into account.
    // WARN: Removed: "This turns the tokenizer into a C-style unsafe container, not a dataset generator."
    // u_int64_t data_len;
    // DATA data;
};

/**
 * @brief Returns the printable name of a token category.
 *
 * Some names carry a trailing tab so that the columns printed by the
 * vocabulary dump line up; those tabs are part of the returned text.
 *
 * @param type Category to name.
 * @return The category name; values outside the named cases map to "UNKNOWN\t".
 */
inline std::string type_to_string(TYPE type) {
    switch (type) {
    case TYPE::KEYWORD:
        return "KEYWORD";
    case TYPE::IDENTIFIER:
        return "IDENTIFIER";
    case TYPE::NUMBER:
        return "NUMBER";
    case TYPE::STRING:
        return "STRING\t";
    case TYPE::CHAR:
        return "CHAR\t";
    case TYPE::OPERATOR:
        return "OPERATOR";
    case TYPE::PUNCTUATION:
        return "PUNCTUATION";
    case TYPE::PREPROCESSOR:
        return "PREPROCESSOR";
    case TYPE::HEADER:
        return "HEADER\t";
    case TYPE::COMMENT:
        return "COMMENT\t";
    case TYPE::END_OF_FILE:
        return "END_OF_FILE";
    default:
        return "UNKNOWN\t";
    }
}

// Token text -> ID and token text -> category tables built by tokenizer().
struct Vocabulary {
    std::unordered_map<std::string, VALUE_ID> value_map;
    std::unordered_map<std::string, TYPE> type_map;
    VALUE_ID next_id = 1; // next ID to hand out; 0 stays reserved for unknown tokens
};

/**
 * @brief Splits source lines into raw token strings.
 *
 * Rules applied per line:
 *  - Whitespace (space, tab, CR, VT, FF) outside a literal ends the current token.
 *  - A string or character literal is kept inside one token. The literal ends
 *    at the same quote character that opened it, backslash escapes are
 *    skipped, and literal state is reset at the start of every line so a
 *    stray quote (for example an apostrophe in a comment) cannot affect the
 *    following lines.
 *  - On a line whose first non-blank character is '#', '<' and '>' act as
 *    delimiters only and are not emitted (so "<stdio.h>" yields "stdio.h").
 *  - Elsewhere '<' and '>' produce "<", ">", "<<", ">>", "<=", ">=", "<<=",
 *    ">>=", and a '-' immediately before '>' produces "->".
 *  - '#', '(', ')', '{', '}', '[', ']', ',', ';', '=' and '!' are emitted as
 *    their own tokens, with "==" and "!=" recognised as pairs.
 *  - Every other character is appended to the current token.
 *
 * @param file_content Source text, one vector of characters per line.
 * @return The tokens in source order.
 */
inline std::vector<std::string> lexer(const std::vector<std::vector<char>> &file_content) {
    std::vector<std::string> tokens;

    for (const auto &line : file_content) {
        // A line is a preprocessor directive when its first non-blank character is '#'.
        bool is_preprocessor = false;
        for (const char ch : line) {
            if (ch == ' ' || ch == '\t')
                continue;
            is_preprocessor = (ch == '#');
            break;
        }

        std::string token;
        // Emits the token collected so far, if any.
        const auto flush = [&tokens, &token]() {
            if (!token.empty()) {
                tokens.push_back(token);
                token.clear();
            }
        };

        // Quote character of the literal currently being read, or '\0' when
        // outside a literal. Tracking the opening character keeps an
        // apostrophe inside "..." (or a '"' inside '...') from ending it.
        char open_quote = '\0';

        for (std::size_t i = 0; i < line.size(); ++i) {
            const char ch = line[i];
            const char next_ch = (i + 1 < line.size()) ? line[i + 1] : '\0';

            if (ch == '\n' || ch == '\0') {
                flush();
                continue;
            }

            if (open_quote != '\0') {
                token += ch;
                if (ch == '\\' && i + 1 < line.size()) {
                    // Keep the escaped character and skip it, so \" or \' does not close the literal.
                    token += next_ch;
                    ++i;
                } else if (ch == open_quote) {
                    open_quote = '\0';
                }
                continue;
            }

            if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\v' || ch == '\f') {
                flush();
                continue;
            }

            if (ch == '"' || ch == '\'') {
                // The quote stays attached to whatever precedes it (e.g. an L prefix).
                token += ch;
                open_quote = ch;
                continue;
            }

            if (ch == '<' || ch == '>') {
                if (is_preprocessor) {
                    // Delimiter only: not emitted as a token.
                    flush();
                    continue;
                }

                // "x->y": the '-' is sitting at the end of the pending token.
                // A pending "--" is a decrement followed by '>', not an arrow.
                if (ch == '>' && !token.empty() && token.back() == '-' &&
                    (token.size() < 2 || token[token.size() - 2] != '-')) {
                    token.pop_back();
                    flush();
                    tokens.push_back("->");
                    continue;
                }

                flush();
                std::string op(1, ch);
                if (next_ch == ch || next_ch == '=') {
                    op += next_ch;
                    ++i; // consume the second character
                    // "<<" / ">>" may be followed by '=' to form a compound assignment.
                    if (op[0] == op[1] && i + 1 < line.size() && line[i + 1] == '=') {
                        op += '=';
                        ++i;
                    }
                }
                tokens.push_back(op);
                continue;
            }

            const bool is_delimiter = ch == '#' || ch == '(' || ch == ')' || ch == '{' || ch == '}' || ch == '[' ||
                                      ch == ']' || ch == ',' || ch == ';' || ch == '=' || ch == '!';
            if (is_delimiter) {
                flush();
                if ((ch == '=' || ch == '!') && next_ch == '=') {
                    tokens.push_back(std::string(1, ch) + '=');
                    ++i; // consume the '='
                } else {
                    tokens.push_back(std::string(1, ch));
                }
                continue;
            }

            token += ch;
        }

        flush();
    }

    return tokens;
}

/**
 * @brief Maps tokens to their vocabulary IDs.
 *
 * @param tokens Tokens to encode.
 * @param vocab  Vocabulary to look the tokens up in; it is not modified.
 * @return One ID per input token, in order. Tokens missing from the
 *         vocabulary are encoded as 0.
 */
inline std::vector<VALUE_ID> encoder(const std::vector<std::string> &tokens, const Vocabulary &vocab) {
    std::vector<VALUE_ID> encoded_tokens;
    encoded_tokens.reserve(tokens.size());

    for (const auto &token : tokens) {
        const auto it = vocab.value_map.find(token);
        // 0 is the reserved ID for unknown tokens.
        encoded_tokens.push_back(it != vocab.value_map.end() ? it->second : 0);
    }
    return encoded_tokens;
}

// Returns true when the token names a C header: text ending in ".h",
// optionally wrapped in double quotes, with no whitespace inside.
inline bool is_header_token(const std::string &token) {
    std::size_t first = 0;
    std::size_t last = token.size(); // one past the final character considered
    if (last >= 2 && token.front() == '"' && token.back() == '"') {
        ++first;
        --last;
    }
    // Require at least one character in front of the ".h" suffix.
    if (last - first <= 2 || token.compare(last - 2, 2, ".h") != 0)
        return false;
    for (std::size_t i = first; i < last; ++i) {
        if (std::isspace(static_cast<unsigned char>(token[i])))
            return false;
    }
    return true;
}

// Assigns a category to a single token text. The order of the checks matters:
// earlier rules win.
inline TYPE classify_token(const std::string &token) {
    static const std::unordered_set<std::string> keywords = {
        "auto",   "break", "case", "char", "const",  "continue", "default", "do",       "double",   "else",   "enum",  "extern",
        "float",  "for",   "goto", "if",   "inline", "int",      "long",    "register", "restrict", "return", "short", "signed",
        "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while"};

    static const std::unordered_set<std::string> preprocessor_words = {"include", "define", "undef", "ifdef", "ifndef", "if",
                                                                       "elif",    "else",   "endif", "line",  "error",  "pragma"};

    static const std::unordered_set<std::string> operators = {"+",  "-",  "*",  "/",  "%",  "++", "--", "==", "!=",  "<",   ">",  "<=",
                                                              ">=", "&&", "||", "!",  "&",  "|",  "^",  "~",  "<<",  ">>",  "=",  "+=",
                                                              "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "->", "."};

    static const std::unordered_set<std::string> punctuation = {"(", ")", "{", "}", "[", "]", ",", ";", ":", "?"};

    if (token.empty())
        return TYPE::UNKNOWN;

    // <cctype> functions require a value representable as unsigned char.
    const auto is_digit = [](char c) { return std::isdigit(static_cast<unsigned char>(c)) != 0; };
    const auto is_alpha = [](char c) { return std::isalpha(static_cast<unsigned char>(c)) != 0; };

    if (token.size() >= 2 && token[0] == '/' && (token[1] == '/' || token[1] == '*'))
        return TYPE::COMMENT;
    // Headers are checked before the generic preprocessor words.
    if (is_header_token(token))
        return TYPE::HEADER;
    // Keywords come before preprocessor words: "if" and "else" are in both
    // sets and must be reported as keywords.
    if (keywords.count(token) > 0)
        return TYPE::KEYWORD;
    if (token == "#" || preprocessor_words.count(token) > 0)
        return TYPE::PREPROCESSOR;
    if (token[0] == '"')
        return TYPE::STRING;
    if (token[0] == '\'')
        return TYPE::CHAR;
    if (is_digit(token[0]) || (token[0] == '.' && token.size() > 1 && is_digit(token[1])))
        return TYPE::NUMBER;
    if (operators.count(token) > 0)
        return TYPE::OPERATOR;
    if (punctuation.count(token) > 0)
        return TYPE::PUNCTUATION;
    if (is_alpha(token[0]) || token[0] == '_')
        return TYPE::IDENTIFIER;
    return TYPE::UNKNOWN;
}

/**
 * @brief Builds a vocabulary from a token stream.
 *
 * Each distinct non-empty token receives the next free ID, starting at 1 in
 * order of first appearance, and a category. Classification depends on the
 * token text alone, not on where the token appeared.
 *
 * @param tokens Tokens as produced by lexer().
 * @return The populated vocabulary; next_id is one past the last ID assigned.
 */
inline Vocabulary tokenizer(const std::vector<std::string> &tokens) {
    Vocabulary vocab;
    vocab.next_id = 1; // IDs start at 1; 0 is reserved for unknown/special tokens

    for (const auto &token : tokens) {
        if (token.empty())
            continue;

        // emplace() does nothing when the token is already registered.
        const auto inserted = vocab.value_map.emplace(token, vocab.next_id);
        if (!inserted.second)
            continue;

        ++vocab.next_id;
        vocab.type_map.emplace(token, classify_token(token));
    }

    return vocab;
}
#endif // TOKEN_CPP
<< std::endl; - return 0; -} \ No newline at end of file +// File I/O functions +#include "io.cpp" + +// Tokenizer functions +#include "token.cpp" + +using namespace std; + +const char *VERSION = "1.0.0"; + +char showTokens = 0; +char FILE_PATH[1024]; + +static inline void process_args(int argc, char **args) { + for (int i = 0; i < argc; i++) { + if (std::string(args[i]) == "--help" || std::string(args[i]) == "-h") { + std::cout << "Usage: " << args[0] << " [options]\n"; + std::cout << "Options:\n"; + std::cout << " --help -h\tShow this help message\n"; + std::cout << " --version -v\tShow version information\n"; + std::cout << " --input -i\tSpecify input file path\n"; + exit(0); + } + + if (std::string(args[i]) == "--version" || std::string(args[i]) == "-v") { + std::cout << "Version " << VERSION << "\n"; + exit(0); + } + + if (std::string(args[i]) == "--input" || std::string(args[i]) == "-i") { + if (i + 1 < argc) { + i++; + strncpy(FILE_PATH, args[i], sizeof(FILE_PATH) - 1); + FILE_PATH[sizeof(FILE_PATH) - 1] = '\0'; + } else { + std::cerr << "Error: --input option requires a file path\n"; + exit(1); + } + } + + if (std::string(args[i]) == "--show-tokens") { + showTokens = 1; + } + } + + if (FILE_PATH[0] == '\0') { + std::cerr << "Error: No input file specified. Use --input to specify an input file.\n"; + exit(1); + } +} + +int main(int argc, char **args) { + process_args(argc, args); + vector> _file_content; + + try { + _file_content = read_char(FILE_PATH); + } catch (const std::exception &e) { + std::cerr << "Error: [" << e.what() << "]\n"; + return 1; + } + cout << "File read successfully. 
Total lines: " << _file_content.size() << "\n"; + cout << "First line (as chars): "; + if (!_file_content.empty()) { + for (const auto &ch : _file_content[0]) { + printf("%c", ch); + } + cout << "\n"; + } + + // NOTE: Tokenizer here + vector tokens = lexer(_file_content); + Vocabulary vocab = tokenizer(tokens); + + // NOTE: Pretty print the vocab mapping + if (showTokens) { + cout << "Vocabulary Mapping:\n"; + for (const auto &pair : vocab.value_map) { + cout << "Token: ID: ["; + printf("%03llu", pair.second); + cout << "] -> " << type_to_string(vocab.type_map.at(pair.first)) << "\tToken: [" << pair.first << "]" << "\n"; + } + } + + // NOTE: Encode tokens to IDs + vector encoded_tokens = encoder(tokens, vocab); + return 0; +} diff --git a/test b/test new file mode 100755 index 0000000..b6171b7 Binary files /dev/null and b/test differ diff --git a/test.c b/test.c new file mode 100644 index 0000000..f6686a7 --- /dev/null +++ b/test.c @@ -0,0 +1,19 @@ +#include +#include + +int main() { + printf("Hello, World!\n"); + return 0; +} + +void simple_test() { + printf("This is a simple test function.\n"); +} + +int my_int; +char my_char; +float my_float; + +int add(int a, int b) { + return a + b; +}