Files
nub-lang/compiler/Compiler/Tokenizer.cs
2026-02-08 00:53:55 +01:00

407 lines
11 KiB
C#

using System.Numerics;
using System.Text;
namespace Compiler;
/// <summary>
/// Splits source text into a flat list of <see cref="Token"/>s.
/// </summary>
/// <remarks>
/// Positions are 1-based. Every token records the line and column of its FIRST
/// character and the number of characters it spans, so tokens that contain a
/// newline (string literals) are still located correctly.
/// </remarks>
public sealed class Tokenizer(string contents)
{
    /// <summary>
    /// Tokenizes <paramref name="contents"/> from start to end.
    /// </summary>
    /// <param name="contents">The complete source text.</param>
    /// <returns>The tokens in source order; whitespace is skipped.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="contents"/> is null.</exception>
    /// <exception cref="Exception">
    /// The text contains an unexpected character, an unterminated string
    /// literal, or a <c>0x</c>/<c>0b</c> prefix without digits.
    /// </exception>
    public static List<Token> Tokenize(string contents)
    {
        ArgumentNullException.ThrowIfNull(contents);
        return new Tokenizer(contents).Tokenize();
    }

    // Offset of the next unread character in `contents`.
    private int index;

    // 1-based position of the next unread character.
    private int line = 1;
    private int column = 1;

    /// <summary>Skips whitespace and collects tokens until the input is exhausted.</summary>
    private List<Token> Tokenize()
    {
        var tokens = new List<Token>();

        while (TryPeek(out var c))
        {
            if (char.IsWhiteSpace(c))
            {
                Consume();
                continue;
            }

            tokens.Add(ParseToken());
        }

        return tokens;
    }

    /// <summary>
    /// Parses the single token that starts at the current position. The caller
    /// guarantees that at least one non-whitespace character is available.
    /// </summary>
    private Token ParseToken()
    {
        if (!TryPeek(out var c))
        {
            throw new Exception("End of tokens");
        }

        // Snapshot the start BEFORE consuming anything: `line` and `column`
        // move while the token is read, and a token may span a newline.
        var startLine = line;
        var startColumn = column;
        var startIndex = index;

        // ASCII only: char.IsDigit also accepts other Unicode decimal digits,
        // which the `digit - '0'` arithmetic would turn into garbage values.
        if (char.IsAsciiDigit(c))
        {
            return ParseIntLiteral(startLine, startColumn, startIndex);
        }

        if (c == '"')
        {
            return ParseStringLiteral(startLine, startColumn, startIndex);
        }

        if (char.IsLetter(c) || c == '_')
        {
            return ParseWord(startLine, startColumn, startIndex);
        }

        return ParseSymbol(c, startLine, startColumn, startIndex);
    }

    /// <summary>
    /// Parses a decimal, hexadecimal (<c>0x</c>) or binary (<c>0b</c>) integer
    /// literal. Underscores between digits are ignored.
    /// </summary>
    private TokenIntLiteral ParseIntLiteral(int startLine, int startColumn, int startIndex)
    {
        var radix = (Peek(), Peek(1)) switch
        {
            ('0', 'x') => 16,
            ('0', 'b') => 2,
            _ => 10,
        };

        if (radix != 10)
        {
            // Skip the two-character prefix.
            Consume();
            Consume();
        }

        var value = BigInteger.Zero;
        var digitCount = 0;

        while (TryPeek(out var c))
        {
            if (c == '_')
            {
                Consume();
                continue;
            }

            var digit = DigitValue(c);
            if (digit < 0 || digit >= radix)
            {
                break;
            }

            Consume();
            value = value * radix + digit;
            digitCount += 1;
        }

        // A decimal literal always has at least its leading digit, so this can
        // only trigger for a bare "0x" / "0b" prefix.
        if (digitCount == 0)
        {
            var prefix = radix == 16 ? "0x" : "0b";
            throw new Exception($"Integer literal '{prefix}' has no digits (line {startLine}, column {startColumn})");
        }

        return new TokenIntLiteral(startLine, startColumn, index - startIndex, value);
    }

    /// <summary>Returns the value of an ASCII hexadecimal digit, or -1 for any other character.</summary>
    private static int DigitValue(char c) => c switch
    {
        >= '0' and <= '9' => c - '0',
        >= 'a' and <= 'f' => c - 'a' + 10,
        >= 'A' and <= 'F' => c - 'A' + 10,
        _ => -1,
    };

    /// <summary>
    /// Parses a double-quoted string literal. The token value is the raw text
    /// between the quotes; no escape sequences are interpreted.
    /// </summary>
    private TokenStringLiteral ParseStringLiteral(int startLine, int startColumn, int startIndex)
    {
        Consume(); // opening quote

        var buf = new StringBuilder();
        while (TryPeek(out var c) && c != '"')
        {
            buf.Append(Consume());
        }

        // Running out of input here means the closing quote is missing; report
        // that instead of the generic end-of-input error from Consume().
        if (!TryPeek(out _))
        {
            throw new Exception($"Unterminated string literal (line {startLine}, column {startColumn})");
        }

        Consume(); // closing quote

        return new TokenStringLiteral(startLine, startColumn, index - startIndex, buf.ToString());
    }

    /// <summary>
    /// Parses a run of letters, digits and underscores and classifies it as a
    /// keyword, a boolean literal or an identifier.
    /// </summary>
    private Token ParseWord(int startLine, int startColumn, int startIndex)
    {
        while (TryPeek(out var c) && (char.IsLetterOrDigit(c) || c == '_'))
        {
            Consume();
        }

        var value = contents[startIndex..index];
        var length = index - startIndex;

        return value switch
        {
            "func" => new TokenKeyword(startLine, startColumn, length, Keyword.Func),
            "let" => new TokenKeyword(startLine, startColumn, length, Keyword.Let),
            "if" => new TokenKeyword(startLine, startColumn, length, Keyword.If),
            "return" => new TokenKeyword(startLine, startColumn, length, Keyword.Return),
            "true" => new TokenBoolLiteral(startLine, startColumn, length, true),
            "false" => new TokenBoolLiteral(startLine, startColumn, length, false),
            _ => new TokenIdent(startLine, startColumn, length, value),
        };
    }

    /// <summary>
    /// Parses a one- or two-character symbol whose first character is
    /// <paramref name="c"/>. Two-character forms (<c>!=</c>, <c>==</c>, ...)
    /// win over their one-character prefix.
    /// </summary>
    /// <exception cref="Exception"><paramref name="c"/> does not start any known symbol.</exception>
    private TokenSymbol ParseSymbol(char c, int startLine, int startColumn, int startIndex)
    {
        // Each arm yields the symbol and how many characters it occupies.
        // Nothing is consumed until the symbol is known, so the error path
        // leaves the position untouched.
        var (symbol, width) = (c, Peek(1)) switch
        {
            ('{', _) => (Symbol.OpenCurly, 1),
            ('}', _) => (Symbol.CloseCurly, 1),
            ('(', _) => (Symbol.OpenParen, 1),
            (')', _) => (Symbol.CloseParen, 1),
            (',', _) => (Symbol.Comma, 1),
            (':', _) => (Symbol.Colon, 1),
            ('^', _) => (Symbol.Caret, 1),
            ('!', '=') => (Symbol.BangEqual, 2),
            ('!', _) => (Symbol.Bang, 1),
            ('=', '=') => (Symbol.EqualEqual, 2),
            ('=', _) => (Symbol.Equal, 1),
            ('<', '=') => (Symbol.LessThanEqual, 2),
            ('<', _) => (Symbol.LessThan, 1),
            ('>', '=') => (Symbol.GreaterThanEqual, 2),
            ('>', _) => (Symbol.GreaterThan, 1),
            ('+', '=') => (Symbol.PlusEqual, 2),
            ('+', _) => (Symbol.Plus, 1),
            ('-', '=') => (Symbol.MinusEqual, 2),
            ('-', _) => (Symbol.Minus, 1),
            ('*', '=') => (Symbol.StarEqual, 2),
            ('*', _) => (Symbol.Star, 1),
            ('/', '=') => (Symbol.ForwardSlashEqual, 2),
            ('/', _) => (Symbol.ForwardSlash, 1),
            _ => throw new Exception($"Unexpected character '{c}'"),
        };

        for (var i = 0; i < width; i++)
        {
            Consume();
        }

        return new TokenSymbol(startLine, startColumn, index - startIndex, symbol);
    }

    /// <summary>
    /// Returns the current character and advances past it, keeping
    /// <c>line</c>/<c>column</c> in sync ('\n' starts a new line at column 1).
    /// </summary>
    /// <exception cref="Exception">The input is exhausted.</exception>
    private char Consume()
    {
        if (index >= contents.Length)
        {
            throw new Exception("End of tokens");
        }

        var c = contents[index];
        if (c == '\n')
        {
            line += 1;
            column = 1;
        }
        else
        {
            column += 1;
        }

        index += 1;
        return c;
    }

    /// <summary>
    /// Returns the character <paramref name="offset"/> positions ahead without
    /// consuming it, or null when that position is past the end of the input.
    /// </summary>
    private char? Peek(int offset = 0)
    {
        if (index + offset >= contents.Length)
        {
            return null;
        }

        return contents[index + offset];
    }

    /// <summary>
    /// Reads the current character without consuming it.
    /// </summary>
    /// <returns>false (with <paramref name="c"/> set to '\0') when the input is exhausted.</returns>
    private bool TryPeek(out char c)
    {
        if (index >= contents.Length)
        {
            c = '\0';
            return false;
        }

        c = contents[index];
        return true;
    }
}
/// <summary>
/// Base type for every token; stores the position and length values given at
/// construction.
/// </summary>
public abstract class Token
{
    /// <summary>Line value supplied at construction.</summary>
    public int Line;

    /// <summary>Column value supplied at construction.</summary>
    public int Column;

    /// <summary>Length value supplied at construction.</summary>
    public int Length;

    /// <summary>Initializes the three location fields from the matching arguments.</summary>
    public Token(int line, int column, int length)
    {
        Line = line;
        Column = column;
        Length = length;
    }
}
/// <summary>Token that carries an identifier's text.</summary>
public sealed class TokenIdent : Token
{
    /// <summary>The identifier text passed to the constructor.</summary>
    public readonly string Ident;

    /// <summary>Creates an identifier token with the given location and text.</summary>
    public TokenIdent(int line, int column, int length, string ident)
        : base(line, column, length)
    {
        Ident = ident;
    }
}
/// <summary>Token that carries an integer literal's value.</summary>
public sealed class TokenIntLiteral : Token
{
    /// <summary>The literal's value. Unlike the other token payloads this field is writable.</summary>
    public BigInteger Value;

    /// <summary>Creates an integer literal token with the given location and value.</summary>
    public TokenIntLiteral(int line, int column, int length, BigInteger value)
        : base(line, column, length)
    {
        Value = value;
    }
}
/// <summary>Token that carries a string literal's text.</summary>
public sealed class TokenStringLiteral : Token
{
    /// <summary>The string value passed to the constructor.</summary>
    public readonly string Value;

    /// <summary>Creates a string literal token with the given location and text.</summary>
    public TokenStringLiteral(int line, int column, int length, string value)
        : base(line, column, length)
    {
        Value = value;
    }
}
/// <summary>Token that carries a boolean literal's value.</summary>
public sealed class TokenBoolLiteral : Token
{
    /// <summary>The boolean value passed to the constructor.</summary>
    public readonly bool Value;

    /// <summary>Creates a boolean literal token with the given location and value.</summary>
    public TokenBoolLiteral(int line, int column, int length, bool value)
        : base(line, column, length)
    {
        Value = value;
    }
}
/// <summary>
/// Punctuation and operator kinds produced by the tokenizer. Members carry
/// implicit numeric values, so their order is significant.
/// </summary>
public enum Symbol
{
/// <summary>The <c>{</c> character.</summary>
OpenCurly,
/// <summary>The <c>}</c> character.</summary>
CloseCurly,
/// <summary>The <c>(</c> character.</summary>
OpenParen,
/// <summary>The <c>)</c> character.</summary>
CloseParen,
/// <summary>The <c>,</c> character.</summary>
Comma,
/// <summary>The <c>:</c> character.</summary>
Colon,
/// <summary>The <c>^</c> character.</summary>
Caret,
/// <summary>The <c>!</c> character when not followed by <c>=</c>.</summary>
Bang,
/// <summary>The <c>=</c> character when not followed by <c>=</c>.</summary>
Equal,
/// <summary>The two-character sequence <c>==</c>.</summary>
EqualEqual,
/// <summary>The two-character sequence <c>!=</c>.</summary>
BangEqual,
/// <summary>The <c>&lt;</c> character when not followed by <c>=</c>.</summary>
LessThan,
/// <summary>The two-character sequence <c>&lt;=</c>.</summary>
LessThanEqual,
/// <summary>The <c>&gt;</c> character when not followed by <c>=</c>.</summary>
GreaterThan,
/// <summary>The two-character sequence <c>&gt;=</c>.</summary>
GreaterThanEqual,
/// <summary>The <c>+</c> character when not followed by <c>=</c>.</summary>
Plus,
/// <summary>The two-character sequence <c>+=</c>.</summary>
PlusEqual,
/// <summary>The <c>-</c> character when not followed by <c>=</c>.</summary>
Minus,
/// <summary>The two-character sequence <c>-=</c>.</summary>
MinusEqual,
/// <summary>The <c>*</c> character when not followed by <c>=</c>.</summary>
Star,
/// <summary>The two-character sequence <c>*=</c>.</summary>
StarEqual,
/// <summary>The <c>/</c> character when not followed by <c>=</c>.</summary>
ForwardSlash,
/// <summary>The two-character sequence <c>/=</c>.</summary>
ForwardSlashEqual,
}
/// <summary>Token that carries a punctuation or operator kind.</summary>
public sealed class TokenSymbol : Token
{
    /// <summary>The symbol kind passed to the constructor.</summary>
    public readonly Symbol Symbol;

    /// <summary>Creates a symbol token with the given location and kind.</summary>
    public TokenSymbol(int line, int column, int length, Symbol symbol)
        : base(line, column, length)
    {
        Symbol = symbol;
    }
}
/// <summary>
/// Reserved words recognized by the tokenizer. Members carry implicit numeric
/// values, so their order is significant.
/// </summary>
public enum Keyword
{
/// <summary>The word <c>func</c>.</summary>
Func,
/// <summary>The word <c>let</c>.</summary>
Let,
/// <summary>The word <c>if</c>.</summary>
If,
/// <summary>The word <c>return</c>.</summary>
Return,
}
/// <summary>Token that carries a reserved-word kind.</summary>
public sealed class TokenKeyword : Token
{
    /// <summary>The keyword kind passed to the constructor.</summary>
    public readonly Keyword Keyword;

    /// <summary>Creates a keyword token with the given location and kind.</summary>
    public TokenKeyword(int line, int column, int length, Keyword keyword)
        : base(line, column, length)
    {
        Keyword = keyword;
    }
}