232 lines
5.6 KiB
C++
232 lines
5.6 KiB
C++
#pragma once
|
|
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "prelude/string.hpp"
|
|
|
|
/// Kinds of token produced by the lexer.
///
/// Named kinds start at 256 so that they never collide with single-character
/// tokens: the lexer stores such a token as the character's own value cast to
/// TokenType.
enum class TokenType : int
{
    Eof = 256,  ///< End of input.
    Id,         ///< Identifier.
    IntLiteral, ///< Integer literal.

    // Keywords.
    Extern,
    Fn,
    Local,

    Unknown,    ///< Value held by a default-constructed Token.
};
|
|
|
|
template <typename... Ts>
|
|
inline bool is_one_of(TokenType t, Ts... ts) {
|
|
return ((t == ts) || ...);
|
|
}
|
|
|
|
// operator== TokenType == char
|
|
inline bool operator==(TokenType t, char c)
|
|
{
|
|
return static_cast<char>(t) == c;
|
|
}
|
|
|
|
// operator== char == TokenType
|
|
inline bool operator==(char c, TokenType t)
|
|
{
|
|
return t == c; // reuse the function above
|
|
}
|
|
|
|
// operator!=
|
|
inline bool operator!=(TokenType t, char c)
|
|
{
|
|
return !(t == c);
|
|
}
|
|
|
|
inline bool operator!=(char c, TokenType t)
|
|
{
|
|
return !(t == c);
|
|
}
|
|
|
|
struct Token
|
|
{
|
|
TokenType token;
|
|
long int_number;
|
|
// null-terminated
|
|
StringView string;
|
|
long line_number;
|
|
long offset_start;
|
|
long offset_end;
|
|
public:
|
|
Token(TokenType t) : token(t) {}
|
|
Token(TokenType t, long lnumber, long soffset, long eoffset)
|
|
: token(t), line_number(lnumber), offset_start(soffset), offset_end(eoffset) {}
|
|
Token() : token(TokenType::Unknown) {}
|
|
};
|
|
|
|
class Lexer
|
|
{
|
|
public:
|
|
const Token& token() const { return m_token; }
|
|
const Token* seek_token()
|
|
{
|
|
auto s = m_token;
|
|
auto p = m_pos;
|
|
auto l = m_line;
|
|
auto lnl = m_last_newline;
|
|
if (!NextToken())
|
|
{
|
|
return new Token(TokenType::Eof, m_line, m_pos - m_last_newline, m_pos - m_last_newline);
|
|
}
|
|
auto seeked = m_token;
|
|
m_token = s;
|
|
m_pos = p;
|
|
m_line = l;
|
|
m_last_newline = lnl;
|
|
return new Token(seeked);
|
|
}
|
|
|
|
const char* filename() const { return m_filename; }
|
|
public:
|
|
Lexer(char* filename, StringView code)
|
|
: m_filename(filename), m_code(code) {}
|
|
|
|
Lexer(const Lexer&) = delete;
|
|
public:
|
|
bool NextToken()
|
|
{
|
|
auto len = m_code.len();
|
|
|
|
auto peek = [&]() -> char {
|
|
return (m_pos < len) ? m_code.data[m_pos] : '\0';
|
|
};
|
|
|
|
auto advance = [&]() -> char {
|
|
return (m_pos < len) ? m_code.data[m_pos++] : '\0';
|
|
};
|
|
|
|
// IMPORTANT: >= not >
|
|
if (m_pos >= len)
|
|
{
|
|
m_token = Token(TokenType::Eof);
|
|
return false;
|
|
}
|
|
|
|
char c = advance();
|
|
|
|
// skip whitespace safely
|
|
while (c != '\0' && std::isspace((unsigned char)c))
|
|
{
|
|
if (c == '\n')
|
|
{
|
|
m_line++;
|
|
m_last_newline = m_pos;
|
|
}
|
|
|
|
if (m_pos >= len) // reached real EOF while skipping whitespace
|
|
{
|
|
m_token = Token(TokenType::Eof);
|
|
return false;
|
|
}
|
|
|
|
c = advance();
|
|
}
|
|
|
|
if (c == '\0' || m_pos > len) // paranoia guard
|
|
{
|
|
m_token = Token(TokenType::Eof);
|
|
return false;
|
|
}
|
|
|
|
// identifier
|
|
if (std::isalpha((unsigned char)c) || c == '_')
|
|
{
|
|
StringBuilder s;
|
|
long offset_start = m_pos - m_last_newline - 1; // -1 because we already consumed c
|
|
s.Push(c);
|
|
|
|
// NOTE: usually identifiers allow digits after first char; add isdigit if you want
|
|
while (true)
|
|
{
|
|
char p = peek();
|
|
if (!(std::isalpha((unsigned char)p) || p == '_'))
|
|
break;
|
|
s.Push(advance());
|
|
}
|
|
|
|
s.Push('\0');
|
|
m_token = Token(TokenType::Id, m_line, offset_start, m_pos - m_last_newline);
|
|
m_token.string = s.view();
|
|
|
|
if (strcmp("extern", m_token.string.c_str()) == 0) m_token.token = TokenType::Extern;
|
|
else if (strcmp("fn", m_token.string.c_str()) == 0) m_token.token = TokenType::Fn;
|
|
else if (strcmp("local", m_token.string.c_str()) == 0) m_token.token = TokenType::Local;
|
|
|
|
return true;
|
|
}
|
|
|
|
// integer (hex supported)
|
|
if (std::isdigit((unsigned char)c))
|
|
{
|
|
StringBuilder s;
|
|
long offset_start = m_pos - m_last_newline - 1;
|
|
|
|
bool hex = (c == '0' && peek() == 'x');
|
|
s.Push(c);
|
|
|
|
if (hex) s.Push(advance()); // consume 'x'
|
|
|
|
while (true)
|
|
{
|
|
char p = peek();
|
|
if (std::isdigit((unsigned char)p) ||
|
|
(hex && std::isxdigit((unsigned char)p)))
|
|
{
|
|
s.Push(advance());
|
|
}
|
|
else break;
|
|
}
|
|
|
|
s.Push('\0');
|
|
m_token = Token(TokenType::IntLiteral, m_line, offset_start, m_pos - m_last_newline);
|
|
m_token.int_number = std::strtol(s.data, nullptr, hex ? 16 : 10);
|
|
m_token.string = s.data;
|
|
return true;
|
|
}
|
|
|
|
// single-char token fallback
|
|
m_token = Token((TokenType)c, m_line, m_pos - m_last_newline - 1, m_pos - m_last_newline);
|
|
return true;
|
|
}
|
|
|
|
void NextExpect(TokenType expected)
|
|
{
|
|
if (!NextToken())
|
|
{
|
|
fprintf(stderr, "%s:%d:%d: ERROR: expected %ld, but got EOF\n", m_filename, 0, 0, expected);
|
|
Exit(1);
|
|
}
|
|
if (token().token != expected)
|
|
{
|
|
fprintf(stderr, "%s:%d:%d: ERROR: expected %ld, but got %ld\n", m_filename, token().line_number, token().offset_start, expected, token().token);
|
|
Exit(1);
|
|
}
|
|
}
|
|
|
|
void NextExpect(char expected)
|
|
{
|
|
NextExpect((TokenType)expected);
|
|
}
|
|
private:
|
|
void Exit(int status)
|
|
{
|
|
std::exit(status);
|
|
}
|
|
private:
|
|
StringView m_code;
|
|
char* m_filename;
|
|
|
|
Token m_token;
|
|
|
|
size_t m_pos = 0;
|
|
size_t m_line = 1;
|
|
size_t m_last_newline = 0;
|
|
};
|