From 13fbdad5639661402bcdc9e0758fe7a68cd5e867 Mon Sep 17 00:00:00 2001 From: admin Date: Fri, 28 Nov 2025 18:32:30 +0100 Subject: [PATCH] feat: basic parsing of example.rx is ready --- .vscode/settings.json | 3 +- example.rx | 8 +- include/ast.hpp | 295 ++++++++++++++++++++++++++++++++++++++---- include/lexer.hpp | 88 +++++++++++-- src/main.cpp | 7 +- 5 files changed, 354 insertions(+), 47 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index a5a2d31..47dba71 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,6 @@ { "files.associations": { - "print": "cpp" + "print": "cpp", + "cctype": "cpp" } } \ No newline at end of file diff --git a/example.rx b/example.rx index 47deb63..6b13e79 100644 --- a/example.rx +++ b/example.rx @@ -1,7 +1,7 @@ -extern putchar; +extern putchar fn main() { - local a = 34; - local b = 35; - putchar(a + b); + local a = 34 + local b = 35 + putchar(a + b) } \ No newline at end of file diff --git a/include/ast.hpp b/include/ast.hpp index 44de678..588748a 100644 --- a/include/ast.hpp +++ b/include/ast.hpp @@ -1,14 +1,21 @@ #pragma once #include +#include +#include #include "lexer.hpp" -enum NodeType +enum class NodeType { Expression = 0, + // TODO: abstract "Literal" node type + IntLiteral, Extern, FnDecl, FnCall, + Variable, VarDecl, + Compound, + Program, COUNT_NODES, }; @@ -20,14 +27,23 @@ public: }; #define NODE_TYPE(x) \ - NodeType GetType() const override { return (x); } + NodeType GetType() const override { return NodeType::x; } class ExpressionNode : public Node { public: - ExpressionNode() - : m_left(nullptr), m_right(nullptr) {} + enum class Operator + { + Plus = 0, + Minus, + Divide, + Multiply, + COUNT_OPERATORS, + }; +public: + ExpressionNode(Node* left, Node* right, Operator op) + : m_left(left), m_right(right), m_op(op) {} ~ExpressionNode() override { delete m_left; delete m_right; @@ -37,9 +53,26 @@ public: public: Node* left() const { return m_left; } Node* right() const { return m_right; } + Operator op() const { return m_op; } private: Node* m_left; Node* m_right; + Operator m_op; +}; + +// TODO: Maybe just LiteralNode with double or int literals support +class IntLiteralNode : public Node +{ +public: + IntLiteralNode(long value) + : m_int_value(value) {} + ~IntLiteralNode() override {} + + NODE_TYPE(IntLiteral) +public: + long integer() const { return m_int_value; } +private: + long m_int_value; }; class ExternNode : public Node @@ -91,6 +124,20 @@ private: Node* m_arg; }; +class VariableNode : public Node +{ +public: + VariableNode(char* name) + : m_name(name) {} + ~VariableNode() override { + delete m_name; + } + + NODE_TYPE(Variable) +private: + char* m_name; +}; + class VarDeclNode : public Node { public: @@ -107,47 +154,239 @@ private: Node* m_value; }; +class CompoundNode : public Node +{ +public: + CompoundNode() = default; + + NODE_TYPE(Compound) + + // --- Iteration --- + auto begin() { return m_nodes.begin(); } + auto end() { return m_nodes.end(); } + auto begin() const { return m_nodes.begin(); } + auto end() const { return m_nodes.end(); } + + // --- Access by index --- + Node* operator[](size_t i) { return m_nodes[i]; } + const Node* operator[](size_t i) const { return m_nodes[i]; } + + Node* at(size_t i) { return m_nodes.at(i); } + const Node* at(size_t i) const { return m_nodes.at(i); } + + // --- Modifiers --- + void addNode(Node* n) { m_nodes.push_back(n); } + void removeNode(size_t idx) { m_nodes.erase(m_nodes.begin() + idx); } + + // If you want full expose for iteration but not modification + const std::vector& nodes() const { return m_nodes; } + + size_t size() const { return m_nodes.size(); } + bool empty() const { return m_nodes.empty(); } +private: + std::vector m_nodes; +}; + +class ProgramNode : public Node +{ +public: + ProgramNode() = default; + + NODE_TYPE(Program) +public: + void PushFunction(FnDeclNode* fn) + { + m_funcs.push_back(fn); + } + + void PushExtern(ExternNode* extrn) + { + m_externs.push_back(extrn); + } +private: + std::vector m_funcs; + std::vector m_externs; +}; + + class AstParser { public: AstParser(Lexer* lexer) : m_lexer(lexer) {} public: + ExternNode* ParseExtern() + { + m_lexer->NextExpect(TokenType::Id); + return new ExternNode(m_lexer->token().string); + } + + FnDeclNode* ParseFnDecl() + { + // Function Declaration + m_lexer->NextExpect(TokenType::Id); + char *name = strdup(m_lexer->token().string); + m_lexer->NextExpect('('); + // TODO: parse parameters + m_lexer->NextExpect(')'); + m_lexer->NextExpect('{'); + auto compound = new CompoundNode(); + while (m_lexer->seek_token()->token != '}') + { + compound->addNode(ParseStatement()); + } + m_lexer->NextExpect('}'); + return new FnDeclNode(name, compound); + } + + FnCallNode* ParseFnCall(char* name) + { + // m_lexer->NextExpect(TokenType::Id); + // char* name = strdup(m_lexer->token().string); + m_lexer->NextExpect('('); + Node* arg = ParseExpression(); + m_lexer->NextExpect(')'); + return new FnCallNode(name, arg); + } + + Node* ParseFactor() + { + auto token = m_lexer->seek_token(); + + switch (token->token) + { + case TokenType::IntLiteral: // integer + { + m_lexer->NextExpect(TokenType::IntLiteral); + auto node = new IntLiteralNode(m_lexer->token().int_number); + return node; + } + case TokenType::Id: // variable name or function call + { + m_lexer->NextExpect(TokenType::Id); + char *name = strdup(m_lexer->token().string); + token = m_lexer->seek_token(); + if (token->token == '(') + { + return ParseFnCall(name); + } + return new VariableNode(name); + } + default: + fprintf(stderr, "%s:%d:%d: ERROR: unexpected token while parsing %ld\n", m_lexer->filename(), token->line_number, token->offset_start, token->token); + Exit(1); + break; + } + + assert(0 && "unreachable"); + } + + Node* ParseTerm() + { + auto t = ParseFactor(); + + for (auto op = m_lexer->seek_token(); is_one_of(op->token, '/', '*'); op = m_lexer->seek_token()) + { + m_lexer->NextToken(); + ExpressionNode::Operator eop; + assert((int)ExpressionNode::Operator::COUNT_OPERATORS == 4 && "some operators may not be handled"); + switch((char)op->token) + { + case '/': + eop = ExpressionNode::Operator::Divide; + break; + case '*': + eop = ExpressionNode::Operator::Multiply; + break; + default: + assert(false && "should be unreachable"); + break; + } + auto expr = new ExpressionNode(t, ParseTerm(), eop); + t = expr; + } + + return t; + } + + Node* ParseExpression() + { + auto t = ParseTerm(); + + for (auto op = m_lexer->seek_token(); is_one_of(op->token, '+', '-'); op = m_lexer->seek_token()) + { + m_lexer->NextToken(); + ExpressionNode::Operator eop; + assert((int)ExpressionNode::Operator::COUNT_OPERATORS == 4 && "some operators may not be handled"); + switch((char)op->token) + { + case '+': + eop = ExpressionNode::Operator::Plus; + break; + case '-': + eop = ExpressionNode::Operator::Minus; + break; + default: + assert(false && "should be unreachable"); + break; + } + auto expr = new ExpressionNode(t, ParseTerm(), eop); + t = expr; + } + + return t; + } + + VarDeclNode* ParseVarDecl() + { + m_lexer->NextExpect(TokenType::Local); + m_lexer->NextExpect(TokenType::Id); + char *name = strdup(m_lexer->token().string); + m_lexer->NextExpect('='); + Node* value = ParseExpression(); + return new VarDeclNode(name, value); + } + Node* ParseStatement() { + auto token = m_lexer->seek_token(); + // TODO: proper error handling + assert(token != nullptr && "next token should be available"); + switch(token->token) + { + case TokenType::Local: return ParseVarDecl(); + default: return ParseExpression(); + } + + assert(0 && "unreachable"); + return nullptr; + } + + ProgramNode* Parse() + { + auto program = new ProgramNode; + while (m_lexer->NextToken()) { auto token = m_lexer->token(); switch(token.token) { - case Id: - { - if (strcmp(token.string, "extern") == 0) - { - // Extern - m_lexer->Eat(Id); - return new ExternNode(m_lexer->token().string); - } - else if (strcmp(token.string, "fn") == 0) - { - // Function Declaration - m_lexer->Eat(Id); - char* name = strdup(m_lexer->token().string); - m_lexer->Eat('('); - // TODO: parse parameters - m_lexer->Eat(')'); - m_lexer->Eat('{'); - // TODO: parse function body - m_lexer->Eat('}'); - return new FnDeclNode(name, nullptr); - } + case TokenType::Extern: program->PushExtern(ParseExtern()); break; + case TokenType::Fn: program->PushFunction(ParseFnDecl()); break; + default: { + fprintf(stderr, "%s:%d:%d: ERROR: unexpected token while parsing %ld\n", m_lexer->filename(), token.line_number, token.offset_start, token.token); + Exit(1); + break; } - break; } } - // TODO: report parse error - return nullptr; + return program; + } +private: + void Exit(int status) + { + std::exit(status); } private: Lexer* m_lexer; diff --git a/include/lexer.hpp b/include/lexer.hpp index 612e955..5c618c5 100644 --- a/include/lexer.hpp +++ b/include/lexer.hpp @@ -1,18 +1,51 @@ #pragma once #include +#include #include #include "string.hpp" -enum TokenType +enum class TokenType { Eof = 256, Id, IntLiteral, + Extern, + Fn, + Local, + Unknown, }; +template +inline bool is_one_of(TokenType t, Ts... ts) { + return ((t == ts) || ...); +} + +// operator== TokenType == char +inline bool operator==(TokenType t, char c) +{ + return static_cast(t) == c; +} + +// operator== char == TokenType +inline bool operator==(char c, TokenType t) +{ + return t == c; // reuse the function above +} + +// operator!= +inline bool operator!=(TokenType t, char c) +{ + return !(t == c); +} + +inline bool operator!=(char c, TokenType t) +{ + return !(t == c); +} + struct Token { TokenType token; @@ -26,13 +59,32 @@ public: Token(TokenType t) : token(t) {} Token(TokenType t, long lnumber, long soffset, long eoffset) : token(t), line_number(lnumber), offset_start(soffset), offset_end(eoffset) {} - Token() : token(Unknown) {} + Token() : token(TokenType::Unknown) {} }; class Lexer { public: - const Token& token() { return m_token; } + const Token& token() const { return m_token; } + const Token* seek_token() + { + auto s = m_token; + auto p = m_pos; + auto l = m_line; + auto lnl = m_last_newline; + if (!NextToken()) + { + return nullptr; + } + auto seeked = m_token; + m_token = s; + m_pos = p; + m_line = l; + m_last_newline = lnl; + return new Token(seeked); + } + + const char* filename() const { return m_filename; } public: Lexer(char* filename, StringView code) : m_filename(filename), m_code(code) {} @@ -74,8 +126,24 @@ public: s.PushChar(m_code.data[m_pos++]); } s.PushChar('\0'); - m_token = Token(Id, m_line, offset_start, m_pos - m_last_newline); + m_token = Token(TokenType::Id, m_line, offset_start, m_pos - m_last_newline); m_token.string = s.data; + + if (strcmp("extern", m_token.string) == 0) + { + m_token.token = TokenType::Extern; + } + + if (strcmp("fn", m_token.string) == 0) + { + m_token.token = TokenType::Fn; + } + + if (strcmp("local", m_token.string) == 0) + { + m_token.token = TokenType::Local; + } + return true; } @@ -91,7 +159,7 @@ public: s.PushChar(m_code.data[m_pos++]); } s.PushChar('\0'); - m_token = Token(IntLiteral, m_line, offset_start, m_pos - m_last_newline); + m_token = Token(TokenType::IntLiteral, m_line, offset_start, m_pos - m_last_newline); m_token.int_number = std::strtol(s.data, nullptr, hex ? 16 : 10); m_token.string = s.data; return true; @@ -101,23 +169,23 @@ public: return true; } - void Eat(TokenType expected) + void NextExpect(TokenType expected) { if (!NextToken()) { - fprintf(stderr, "%s:%d:%d: ERROR: expected %ld, but got EOF", m_filename, 0, 0, expected); + fprintf(stderr, "%s:%d:%d: ERROR: expected %ld, but got EOF\n", m_filename, 0, 0, expected); Exit(1); } if (token().token != expected) { - fprintf(stderr, "%s:%d:%d: ERROR: expected %ld, but got %ld", m_filename, token().line_number, token().offset_start, expected); + fprintf(stderr, "%s:%d:%d: ERROR: expected %ld, but got %ld\n", m_filename, token().line_number, token().offset_start, expected, token().token); Exit(1); } } - void Eat(char expected) + void NextExpect(char expected) { - Eat((TokenType)expected); + NextExpect((TokenType)expected); } private: void Exit(int status) diff --git a/src/main.cpp b/src/main.cpp index 771705c..b8bbeb9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,9 +8,9 @@ void dump_tokens(const char* filename, Lexer* lexer) { while (lexer->NextToken()) { std::print("{}:{}:{}: ", filename, lexer->token().line_number, lexer->token().offset_start); - if (lexer->token().token == Id) + if (lexer->token().token == TokenType::Id) std::println("id = {}", lexer->token().string); - else if (lexer->token().token == IntLiteral) + else if (lexer->token().token == TokenType::IntLiteral) std::println("int = {}", lexer->token().int_number); else std::println("token = {}", (char)lexer->token().token); @@ -48,8 +48,7 @@ int main(int argc, char** argv) AstParser parser(&lexer); - auto node = parser.ParseStatement(); - ExternNode* extrn = reinterpret_cast(node); + auto program = parser.Parse(); return 0; }