From 7febbb80d42e96faefe2465ea8fdc1a823a6c353 Mon Sep 17 00:00:00 2001 From: admin Date: Wed, 26 Nov 2025 22:26:10 +0100 Subject: [PATCH] feat: basic lexer + parser with basic example --- .vscode/settings.json | 5 ++ CMakeLists.txt | 1 + example.rx | 7 ++ include/ast.hpp | 154 ++++++++++++++++++++++++++++++++++++++++++ include/lexer.hpp | 136 +++++++++++++++++++++++++++++++++++++ include/string.hpp | 47 +++++++++++++ src/main.cpp | 49 +++++++++++++- test | Bin 0 -> 15360 bytes test.asm | 16 +++++ test.o | Bin 0 -> 608 bytes 10 files changed, 414 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json create mode 100644 example.rx create mode 100644 include/ast.hpp create mode 100644 include/lexer.hpp create mode 100644 include/string.hpp create mode 100755 test create mode 100644 test.asm create mode 100644 test.o diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..a5a2d31 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "files.associations": { + "print": "cpp" + } +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c8690a..9d0e055 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,4 +6,5 @@ set(SOURCES src/main.cpp ) +include_directories(pl ${CMAKE_CURRENT_SOURCE_DIR}/include) add_executable(pl ${SOURCES}) diff --git a/example.rx b/example.rx new file mode 100644 index 0000000..47deb63 --- /dev/null +++ b/example.rx @@ -0,0 +1,7 @@ +extern putchar; + +fn main() { + local a = 34; + local b = 35; + putchar(a + b); +} \ No newline at end of file diff --git a/include/ast.hpp b/include/ast.hpp new file mode 100644 index 0000000..44de678 --- /dev/null +++ b/include/ast.hpp @@ -0,0 +1,154 @@ +#pragma once +#include +#include "lexer.hpp" + +enum NodeType +{ + Expression = 0, + Extern, + FnDecl, + FnCall, + VarDecl, + COUNT_NODES, +}; + +class Node +{ +public: + virtual NodeType GetType() const = 0; + virtual ~Node() {} +}; + +#define NODE_TYPE(x) \ + NodeType GetType() const override { return (x); } + + +class ExpressionNode : public Node +{ +public: + ExpressionNode() + : m_left(nullptr), m_right(nullptr) {} + ~ExpressionNode() override { + delete m_left; + delete m_right; + } + + NODE_TYPE(Expression) +public: + Node* left() const { return m_left; } + Node* right() const { return m_right; } +private: + Node* m_left; + Node* m_right; +}; + +class ExternNode : public Node +{ +public: + // TODO: support multiple extern symbols + ExternNode(char* symbol) + : m_symbol(symbol) {} + ~ExternNode() override { + delete m_symbol; + } + + NODE_TYPE(Extern) +private: + char* m_symbol; +}; + +class FnDeclNode : public Node +{ +public: + // TODO: support parameters + FnDeclNode(char* name, Node* body) + : m_name(name), m_body(body) {} + ~FnDeclNode() override { + delete m_name; + delete m_body; + } + + NODE_TYPE(FnDecl) +private: + char* m_name; + Node* m_body; +}; + +class FnCallNode : public Node +{ +public: + // TODO: support multiple arguments + FnCallNode(char* name, Node* arg) + : m_name(name), m_arg(arg) {} + ~FnCallNode() override { + delete m_name; + delete m_arg; + } + + NODE_TYPE(FnCall) +private: + char* m_name; + Node* m_arg; +}; + +class VarDeclNode : public Node +{ +public: + VarDeclNode(char* name, Node* value) + : m_name(name), m_value(value) {} + ~VarDeclNode() override { + delete m_name; + delete m_value; + } + + NODE_TYPE(VarDecl) +private: + char* m_name; + Node* m_value; +}; + +class AstParser +{ +public: + AstParser(Lexer* lexer) + : m_lexer(lexer) {} +public: + Node* ParseStatement() + { + while (m_lexer->NextToken()) + { + auto token = m_lexer->token(); + switch(token.token) + { + case Id: + { + if (strcmp(token.string, "extern") == 0) + { + // Extern + m_lexer->Eat(Id); + return new ExternNode(m_lexer->token().string); + } + else if (strcmp(token.string, "fn") == 0) + { + // Function Declaration + m_lexer->Eat(Id); + char* name = strdup(m_lexer->token().string); + m_lexer->Eat('('); + // TODO: parse parameters + m_lexer->Eat(')'); + m_lexer->Eat('{'); + // TODO: parse function body + m_lexer->Eat('}'); + return new FnDeclNode(name, nullptr); + } + } + break; + } + } + + // TODO: report parse error + return nullptr; + } +private: + Lexer* m_lexer; +}; diff --git a/include/lexer.hpp b/include/lexer.hpp new file mode 100644 index 0000000..612e955 --- /dev/null +++ b/include/lexer.hpp @@ -0,0 +1,136 @@ +#pragma once +#include +#include + +#include "string.hpp" + +enum TokenType +{ + Eof = 256, + Id, + IntLiteral, + + Unknown, +}; + +struct Token +{ + TokenType token; + long int_number; + // null-terminated + char* string; + long line_number; + long offset_start; + long offset_end; +public: + Token(TokenType t) : token(t) {} + Token(TokenType t, long lnumber, long soffset, long eoffset) + : token(t), line_number(lnumber), offset_start(soffset), offset_end(eoffset) {} + Token() : token(Unknown) {} +}; + +class Lexer +{ +public: + const Token& token() { return m_token; } +public: + Lexer(char* filename, StringView code) + : m_filename(filename), m_code(code) {} + + Lexer(const Lexer&) = delete; + Lexer(Lexer&& other) + { + m_code = other.m_code; + other.m_code = StringView(); + } +public: + bool NextToken() + { + if (m_pos >= m_code.size) + { + m_token = Token(TokenType::Eof); + return false; + } + + char c = m_code.data[m_pos++]; + + while(std::isspace(c)) { + if (c == '\n') + { + m_line++; + m_last_newline = m_pos; + } + c = m_code.data[m_pos++]; + } + + if (std::isalpha(c) != 0) + { + StringBuilder s; + long offset_start = m_pos - m_last_newline; + s.PushChar(c); + // id + while (std::isalpha(m_code.data[m_pos]) != 0) + { + s.PushChar(m_code.data[m_pos++]); + } + s.PushChar('\0'); + m_token = Token(Id, m_line, offset_start, m_pos - m_last_newline); + m_token.string = s.data; + return true; + } + + if (std::isdigit(c) != 0) + { + StringBuilder s; + long offset_start = m_pos - m_last_newline; + bool hex = c == '0' && m_code.data[m_pos] == 'x'; + s.PushChar(c); + // integer (could be hex) + while (std::isdigit(m_code.data[m_pos]) != 0 || (hex && std::isalpha(m_code.data[m_pos]) != 0)) + { + s.PushChar(m_code.data[m_pos++]); + } + s.PushChar('\0'); + m_token = Token(IntLiteral, m_line, offset_start, m_pos - m_last_newline); + m_token.int_number = std::strtol(s.data, nullptr, hex ? 16 : 10); + m_token.string = s.data; + return true; + } + + m_token = Token((TokenType)c, m_line, m_pos - m_last_newline, m_pos - m_last_newline + 1); + return true; + } + + void Eat(TokenType expected) + { + if (!NextToken()) + { + fprintf(stderr, "%s:%d:%d: ERROR: expected %ld, but got EOF", m_filename, 0, 0, expected); + Exit(1); + } + if (token().token != expected) + { + fprintf(stderr, "%s:%d:%d: ERROR: expected %ld, but got %ld", m_filename, token().line_number, token().offset_start, expected); + Exit(1); + } + } + + void Eat(char expected) + { + Eat((TokenType)expected); + } +private: + void Exit(int status) + { + std::exit(status); + } +private: + StringView m_code; + char* m_filename; + + Token m_token; + + size_t m_pos = 0; + size_t m_line = 1; + size_t m_last_newline = 0; +}; \ No newline at end of file diff --git a/include/string.hpp b/include/string.hpp new file mode 100644 index 0000000..7cc7ed6 --- /dev/null +++ b/include/string.hpp @@ -0,0 +1,47 @@ +#pragma once +#include + +struct StringView +{ + size_t size; + const char* data; +public: + StringView() + { + data = nullptr; + size = 0; + } + + StringView(const char* data, size_t size) + { + this->data = data; + this->size = size; + } +}; + +struct StringBuilder +{ + size_t size; + size_t capacity; + char* data; +public: + StringBuilder() + { + size = 0; + capacity = 10; + data = (char*)malloc(capacity * sizeof(char)); + } +private: + void ensureSize(size_t newSize) + { + if (newSize <= capacity) return; + capacity = capacity + (capacity / 2); + data = (char*)realloc(data, capacity * sizeof(char)); + } +public: + void PushChar(char c) + { + ensureSize(size + 1); + data[size++] = c; + } +}; \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index fcf145f..771705c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,8 +1,55 @@ #include +#include #include +#include "lexer.hpp" +#include "ast.hpp" + +void dump_tokens(const char* filename, Lexer* lexer) +{ + while (lexer->NextToken()) { + std::print("{}:{}:{}: ", filename, lexer->token().line_number, lexer->token().offset_start); + if (lexer->token().token == Id) + std::println("id = {}", lexer->token().string); + else if (lexer->token().token == IntLiteral) + std::println("int = {}", lexer->token().int_number); + else + std::println("token = {}", (char)lexer->token().token); + } +} int main(int argc, char** argv) { - std::println("Hello, World!"); + for (int i = 0; i < argc; ++i) { + std::println("arg#{}: {}", i, argv[i]); + } + char* filename; + if (argc > 1) { + filename = (++argv)[0]; + } else { + fprintf(stderr, "ERROR: Input file is required.\n"); + return 1; + } + + std::ifstream f(filename); + if (!f.is_open()) { + fprintf(stderr, "ERROR: Failed to open input file: %s\n", filename); + return 1; + } + + std::string content((std::istreambuf_iterator(f)), std::istreambuf_iterator()); + + f.close(); + + // std::println("{}", content); + + Lexer lexer(filename, StringView(content.c_str(), content.size())); + + // dump_tokens(filename, &lexer); + + AstParser parser(&lexer); + + auto node = parser.ParseStatement(); + ExternNode* extrn = reinterpret_cast(node); + return 0; } diff --git a/test b/test new file mode 100755 index 0000000000000000000000000000000000000000..c35eebf8936f981858b6d562ffd694df12431550 GIT binary patch literal 15360 zcmeHOU2Ggz6~4Q6$d=YWwHoRsX)~%))K>DuKd}`kZr1-}a1xWa5~ym2wRdc9W$&8Z zad9G{S`7%SDo_MMv@ca3kl+ES4+vG_fsI0&wBo@P5;EceQ?+VZ{vx%YmP#4UxpThp z_+keT5(w$u$oISF-1FUg=FFYBvzaqbj7*GXGZ}@_pdM5#ETXCqOVxpt9Tl0Z;=FPCkZPP!+9${>eec5w;qSb z!g$pDmkQgr?O=J+F8a@q-r2!^ZWsOIyXX&){_Y*dQA^9=%zUA$e4mbX#;=8iYUnQ( zN@dc_tQ7p&Qn|2DIu|JJ=40j0b2MpoyS>UEpE}_egK98Ws)a#y>cr4O zr5sEZrWXPlYi_YpmeKv#bVoy_vUE-|`X8U#49WKsVmN+)S&n_$hzhIB??_u z9j7)IT;Eu)owvsN4Y_`y-;@Dc^Kn1)kn1uauUT1Q+VDzBFPEy;uc!1_^W3qK((C)4 zwx;_^O26&8$j1hI3I5%+B|}@V5wH=k5wH=k5wH=k5wH>XKZwA)EqDFRJO78A_tc(? zN0jouS`V|)x_AEPxl7vnsP|P;NB8`K>Xt)!YOy@eeQ$3@QN;YNKaQ`x_Nce^x_ADK zYm-wWUG=V4ybJxmBb#X7!({&6tlM(vD|$TAKTjI(!k%XtKJk9IkEVW*+cfn}(Pqn` zRW`gNNcLaU_PviW+;`nuyXIZ|({b^W$(4?;cYZPlMIlHHfLK74R7(;zt#Q@ z>Q>7=4|(VN-*Ktwt-TpGc~A8>lRnzFL8_}WbTBXPX(UDFkxN_tk{@|ByCw5@Z_kk} z{JVU4ga*9&#`P%jE?@pAw>NaVPEBUj1gvp(gXL{5guat(sd$o7+F#a`TsRtLpgv2k!4VtX~K6yDBE|yC!}Q z%7u0OXN!%1jew1Sjew1Sjew1Sjew1Sjew1Sjle&T0P+rzXNX)r&Sm6sOAJY3W0S~V zkbK2EL`FU%=WlXBp5p#kRLE;YzAgIyay_cB$qSOCh}^>!`C*5g!*%KB7p7dEjzQfl z$iGLq?293(k^6X6@)(gL348orFRa6 z-7crQv%9yetE(H9J0r&-LubvLFFwz$s?pibyHnfWu5dol+e3{u2q_%fMPDSnld?ne zn_Azla(QTCTqS6wv)rDKWz=7zMq9R7;d+4n0yWs)9OqfmJ1|P&52<}?wp}5f==urk zRZBv`|98AZ`c`#^a`Lc@@!JG#sww#%L;qh3>0YHjwm1F%73}4DCQ+B)P|g0B_HU8i z(H7e){v<^IN6mJ%I!OAqEM29>x>5e{=T8ot7$2gPYg#q_i06-<9yl@Ld&8%dKQ?h{ zaA3keH9C4`WXhi!7@QdKkpZ4ZTDQ*SHnYK#UP}7DVWM|XEQAHcY2zhwfrU8h`yW!k z{bH@+&lk!?P7fbHMOMX9*&Mdb%&s{H9%O}LVePATR7$jM>|$7$CJd`F&ZAYXgn>I(UUrwNm8GB>o>gw5nqDr^mjI=ra!X}OZ(mZ{aK2ER zS8nlanFft9tj0Z$2i01sQr;%`)K?7_3QUmJ(n821l4EWdtWd2Z=T<8^j&3mT&sGbI zfj?g)E+lblU~s%6EX>7xzfi3f$YJKAP16(>OEcs{g+`{z)zig8Q+Sm-Q(0UL$`qCV zu<-uG_jK$7_>KhV7~|u94>e~4rTI&8qXr&^p&+hTs0XM)mqarEZr?G%o^8`*oP@v9 z9s&PfkokewUu8Q<_F;^1h8nzQg1;_&;19%*w?Cd6$B+H(VbU=LeC*Re?8{`Li|PLi zX?YI>ANwA#S$OatcEBa-=RFlN_Elh~@J;_kt)8F;pA+!0uLJ91i22RqUlabI*x@+= zh&VBypCwE_-=8@q@_4azUzGbJo}b7>7jyl5gS2VBBfJEK2axXM_WO*%&x--jF-{ox z81FfQk9`k_=R??={(oQi$zJVP(;vV4V zVlk<4{-Ah?8tLnQx9`|sZ_fWq>fkjlmtFE55WX8T=YtIVEph2-AmQV=dZUAsRA3iN zTQc;&rw(3wm>~r`3*WToxzeRc<`?s%3w-nX1wV~gdkL8u v$Kbzo@;c_V1AlN|;olqZotjDUJfjWMqH=Mg}_u1P><4z@UI6=l~XWU|?lnVemM9+!e@u0TKeTxe)9FAfdw? zAQA{zfS4JG|NMsl5s)Adpo>F9F-b9?5Hl8+$pFQLu!t)H#RV{pVPMcJuFNe-Oajs+ zMPRxpH78N8B(_Y|s D