// Source: nub-lang/compiler/Tokenizer.cs
// Revision: nub31 6ae10d5f90, 2026-02-10 19:50:55 +01:00
// Size: 597 lines, 18 KiB (C#)

using System.Numerics;
using System.Text;
namespace Compiler;
/// <summary>
/// Splits the text of one source file into <see cref="Token"/>s and collects a
/// <see cref="Diagnostic"/> for every piece of input that cannot be tokenized.
/// </summary>
/// <param name="fileName">File name attached to every diagnostic that is reported.</param>
/// <param name="contents">Complete text to tokenize.</param>
public sealed class Tokenizer(string fileName, string contents)
{
    /// <summary>
    /// Tokenizes <paramref name="contents"/> in a single call.
    /// </summary>
    /// <param name="fileName">File name attached to every diagnostic that is reported.</param>
    /// <param name="contents">Complete text to tokenize.</param>
    /// <param name="diagnostics">Receives every diagnostic produced while tokenizing.</param>
    /// <returns>
    /// The tokens in source order, or <c>null</c> when at least one diagnostic has
    /// <see cref="DiagnosticSeverity.Error"/> severity.
    /// </returns>
    public static List<Token>? Tokenize(string fileName, string contents, out List<Diagnostic> diagnostics)
    {
        return new Tokenizer(fileName, contents).Tokenize(out diagnostics);
    }

    // Offset of the next unread character in `contents`.
    private int index;

    // Position of the next unread character; both start at 1 and are maintained by TryConsume.
    private int line = 1;
    private int column = 1;

    /// <summary>
    /// Runs the tokenizer over the whole input. Whitespace and <c>//</c> line comments are skipped;
    /// a <see cref="CompileException"/> raised while reading a token is recorded and scanning continues.
    /// </summary>
    private List<Token>? Tokenize(out List<Diagnostic> diagnostics)
    {
        var tokens = new List<Token>();
        diagnostics = [];

        while (TryPeek(out var c))
        {
            var tokenStart = index;

            try
            {
                if (char.IsWhiteSpace(c))
                {
                    Consume();
                    continue;
                }

                if (c == '/' && Peek(1) == '/')
                {
                    // Read up to, but not including, the line break. The newline (if any) is handled as
                    // ordinary whitespace on the next iteration, so a comment on the last line of a file
                    // without a trailing newline does not produce an "Unexpected end of file" error.
                    while (TryPeek(out c) && c != '\n')
                    {
                        Consume();
                    }

                    continue;
                }

                tokens.Add(ParseToken());
            }
            catch (CompileException e)
            {
                diagnostics.Add(e.Diagnostic);

                // Guarantee forward progress: skip one character only when the failed attempt consumed
                // nothing. If it already advanced, the current character may start a valid token.
                if (index == tokenStart)
                {
                    TryConsume(out _);
                }
            }
        }

        if (diagnostics.Any(x => x.Severity == DiagnosticSeverity.Error))
        {
            return null;
        }

        return tokens;
    }

    /// <summary>
    /// Reads exactly one token starting at the current position. The caller must ensure that at
    /// least one character is left.
    /// </summary>
    /// <exception cref="CompileException">The input at the current position is not a valid token.</exception>
    private Token ParseToken()
    {
        var startColumn = column;
        var c = Peek()!.Value;

        // Only ASCII digits start a number: the literal readers compute digit values by subtracting
        // '0', which is meaningless for other Unicode decimal digits.
        if (char.IsAsciiDigit(c))
        {
            return ParseIntLiteral(startColumn);
        }

        // Two-character symbols are listed before their one-character prefix so the longer match wins.
        return (c, Peek(1)) switch
        {
            ('"', _) => ParseStringLiteral(startColumn),
            ('{', _) => ConsumeSymbol(startColumn, 1, Symbol.OpenCurly),
            ('}', _) => ConsumeSymbol(startColumn, 1, Symbol.CloseCurly),
            ('(', _) => ConsumeSymbol(startColumn, 1, Symbol.OpenParen),
            (')', _) => ConsumeSymbol(startColumn, 1, Symbol.CloseParen),
            (',', _) => ConsumeSymbol(startColumn, 1, Symbol.Comma),
            ('.', _) => ConsumeSymbol(startColumn, 1, Symbol.Period),
            (':', ':') => ConsumeSymbol(startColumn, 2, Symbol.ColonColon),
            (':', _) => ConsumeSymbol(startColumn, 1, Symbol.Colon),
            ('^', _) => ConsumeSymbol(startColumn, 1, Symbol.Caret),
            ('!', '=') => ConsumeSymbol(startColumn, 2, Symbol.BangEqual),
            ('!', _) => ConsumeSymbol(startColumn, 1, Symbol.Bang),
            ('=', '=') => ConsumeSymbol(startColumn, 2, Symbol.EqualEqual),
            ('=', _) => ConsumeSymbol(startColumn, 1, Symbol.Equal),
            ('<', '<') => ConsumeSymbol(startColumn, 2, Symbol.LessThanLessThan),
            ('<', '=') => ConsumeSymbol(startColumn, 2, Symbol.LessThanEqual),
            ('<', _) => ConsumeSymbol(startColumn, 1, Symbol.LessThan),
            ('>', '>') => ConsumeSymbol(startColumn, 2, Symbol.GreaterThanGreaterThan),
            ('>', '=') => ConsumeSymbol(startColumn, 2, Symbol.GreaterThanEqual),
            ('>', _) => ConsumeSymbol(startColumn, 1, Symbol.GreaterThan),
            ('+', '=') => ConsumeSymbol(startColumn, 2, Symbol.PlusEqual),
            ('+', _) => ConsumeSymbol(startColumn, 1, Symbol.Plus),
            ('-', '=') => ConsumeSymbol(startColumn, 2, Symbol.MinusEqual),
            ('-', _) => ConsumeSymbol(startColumn, 1, Symbol.Minus),
            ('*', '=') => ConsumeSymbol(startColumn, 2, Symbol.StarEqual),
            ('*', _) => ConsumeSymbol(startColumn, 1, Symbol.Star),
            ('/', '=') => ConsumeSymbol(startColumn, 2, Symbol.ForwardSlashEqual),
            ('/', _) => ConsumeSymbol(startColumn, 1, Symbol.ForwardSlash),
            ('%', '=') => ConsumeSymbol(startColumn, 2, Symbol.PercentEqual),
            ('%', _) => ConsumeSymbol(startColumn, 1, Symbol.Percent),
            ('&', '&') => ConsumeSymbol(startColumn, 2, Symbol.AmpersandAmpersand),
            ('&', _) => ConsumeSymbol(startColumn, 1, Symbol.Ampersand),
            ('|', '|') => ConsumeSymbol(startColumn, 2, Symbol.PipePipe),
            ('|', _) => ConsumeSymbol(startColumn, 1, Symbol.Pipe),
            _ when char.IsLetter(c) || c == '_' => ParseWord(startColumn),
            _ => throw new CompileException(Diagnostic.Error($"Unexpected character '{c}'").At(fileName, line, column, 1).Build()),
        };
    }

    // Reads a decimal, hexadecimal (0x...) or binary (0b...) integer literal.
    private Token ParseIntLiteral(int startColumn)
    {
        if (Peek() is '0')
        {
            switch (Peek(1))
            {
                case 'x':
                    return ParsePrefixedIntLiteral(startColumn, 16, "Expected hexadecimal digits after 0x");
                case 'b':
                    return ParsePrefixedIntLiteral(startColumn, 2, "Expected binary digits after 0b");
            }
        }

        var value = ReadDigits(10, out _);
        return new TokenIntLiteral(line, startColumn, column - startColumn, value);
    }

    // Reads the two-character radix prefix and the digits after it; at least one digit is required.
    private Token ParsePrefixedIntLiteral(int startColumn, int radix, string missingDigitsMessage)
    {
        Consume();
        Consume();

        var value = ReadDigits(radix, out var seenDigit);
        if (!seenDigit)
        {
            throw new CompileException(Diagnostic.Error(missingDigitsMessage).At(fileName, line, startColumn, column - startColumn).Build());
        }

        return new TokenIntLiteral(line, startColumn, column - startColumn, value);
    }

    // Accumulates consecutive digits valid for `radix`, ignoring '_' separators, and stops in front
    // of the first character that is neither.
    private BigInteger ReadDigits(int radix, out bool seenDigit)
    {
        var value = BigInteger.Zero;
        seenDigit = false;

        while (TryPeek(out var c))
        {
            if (c == '_')
            {
                Consume();
                continue;
            }

            var digit = DigitValue(c);
            if (digit < 0 || digit >= radix)
            {
                break;
            }

            Consume();
            seenDigit = true;
            value = value * radix + digit;
        }

        return value;
    }

    // Numeric value of an ASCII hexadecimal digit, or -1 when `c` is not one.
    private static int DigitValue(char c)
    {
        return c switch
        {
            >= '0' and <= '9' => c - '0',
            >= 'a' and <= 'f' => c - 'a' + 10,
            >= 'A' and <= 'F' => c - 'A' + 10,
            _ => -1,
        };
    }

    // Reads a double-quoted string literal. The text between the quotes is taken verbatim and the
    // literal must be closed before the end of the line.
    private Token ParseStringLiteral(int startColumn)
    {
        Consume();

        var buf = new StringBuilder();
        while (true)
        {
            if (!TryPeek(out var c))
            {
                throw new CompileException(Diagnostic.Error("Unterminated string literal").At(fileName, line, column, 0).Build());
            }

            if (c == '"')
            {
                break;
            }

            if (c == '\n')
            {
                throw new CompileException(Diagnostic.Error("Unterminated string literal").At(fileName, line, column, 1).Build());
            }

            buf.Append(Consume());
        }

        Consume();
        return new TokenStringLiteral(line, startColumn, column - startColumn, buf.ToString());
    }

    // Consumes `length` characters and wraps them in a symbol token.
    private Token ConsumeSymbol(int startColumn, int length, Symbol symbol)
    {
        for (var i = 0; i < length; i++)
        {
            Consume();
        }

        return new TokenSymbol(line, startColumn, column - startColumn, symbol);
    }

    // Reads a run of letters, digits and underscores and classifies it as a keyword, a boolean
    // literal or an identifier.
    private Token ParseWord(int startColumn)
    {
        var start = index;
        while (TryPeek(out var c) && (char.IsLetterOrDigit(c) || c == '_'))
        {
            Consume();
        }

        var value = contents[start..index];
        var length = column - startColumn;

        return value switch
        {
            "func" => new TokenKeyword(line, startColumn, length, Keyword.Func),
            "struct" => new TokenKeyword(line, startColumn, length, Keyword.Struct),
            "let" => new TokenKeyword(line, startColumn, length, Keyword.Let),
            "if" => new TokenKeyword(line, startColumn, length, Keyword.If),
            "else" => new TokenKeyword(line, startColumn, length, Keyword.Else),
            "while" => new TokenKeyword(line, startColumn, length, Keyword.While),
            "return" => new TokenKeyword(line, startColumn, length, Keyword.Return),
            "module" => new TokenKeyword(line, startColumn, length, Keyword.Module),
            "true" => new TokenBoolLiteral(line, startColumn, length, true),
            "false" => new TokenBoolLiteral(line, startColumn, length, false),
            _ => new TokenIdent(line, startColumn, length, value),
        };
    }

    // Advances past the next character and updates line/column. Returns false, with `c` set to
    // '\0', when the input is exhausted.
    private bool TryConsume(out char c)
    {
        if (index >= contents.Length)
        {
            c = '\0';
            return false;
        }

        c = contents[index];
        if (c == '\n')
        {
            line += 1;
            column = 1;
        }
        else
        {
            column += 1;
        }

        index += 1;
        return true;
    }

    // Like TryConsume, but reports "Unexpected end of file" when no character is left.
    private char Consume()
    {
        if (!TryConsume(out var c))
        {
            throw new CompileException(Diagnostic.Error("Unexpected end of file").At(fileName, line, column, 0).Build());
        }

        return c;
    }

    // Returns the character `offset` positions ahead without consuming it, or null past the end.
    private char? Peek(int offset = 0)
    {
        if (index + offset >= contents.Length)
        {
            return null;
        }

        return contents[index + offset];
    }

    // Non-nullable variant of Peek() for the current position; `c` is '\0' when it returns false.
    private bool TryPeek(out char c)
    {
        if (index >= contents.Length)
        {
            c = '\0';
            return false;
        }

        c = contents[index];
        return true;
    }
}
/// <summary>
/// Common base of all token types. Holds the source position and extent handed to the constructor.
/// </summary>
/// <param name="line">Line the token is on.</param>
/// <param name="column">Column at which the token starts.</param>
/// <param name="length">Number of characters the token spans.</param>
public abstract class Token(int line, int column, int length)
{
    /// <summary>Line the token is on.</summary>
    public int Line => line;

    /// <summary>Column at which the token starts.</summary>
    public int Column => column;

    /// <summary>Number of characters the token spans.</summary>
    public int Length => length;
}
/// <summary>
/// Token carrying an identifier.
/// </summary>
public sealed class TokenIdent : Token
{
    /// <summary>Creates an identifier token at the given position.</summary>
    /// <param name="line">Line the token is on.</param>
    /// <param name="column">Column at which the token starts.</param>
    /// <param name="length">Number of characters the token spans.</param>
    /// <param name="ident">The identifier text.</param>
    public TokenIdent(int line, int column, int length, string ident)
        : base(line, column, length)
    {
        Ident = ident;
    }

    /// <summary>The identifier text.</summary>
    public string Ident { get; }
}
/// <summary>
/// Token carrying the value of an integer literal.
/// </summary>
public sealed class TokenIntLiteral : Token
{
    /// <summary>Creates an integer literal token at the given position.</summary>
    /// <param name="line">Line the token is on.</param>
    /// <param name="column">Column at which the token starts.</param>
    /// <param name="length">Number of characters the token spans.</param>
    /// <param name="value">Value of the literal.</param>
    public TokenIntLiteral(int line, int column, int length, BigInteger value)
        : base(line, column, length)
    {
        Value = value;
    }

    /// <summary>Value of the literal, stored with arbitrary precision.</summary>
    public BigInteger Value { get; }
}
/// <summary>
/// Token carrying the contents of a string literal.
/// </summary>
public sealed class TokenStringLiteral : Token
{
    /// <summary>Creates a string literal token at the given position.</summary>
    /// <param name="line">Line the token is on.</param>
    /// <param name="column">Column at which the token starts.</param>
    /// <param name="length">Number of characters the token spans.</param>
    /// <param name="value">Contents of the literal.</param>
    public TokenStringLiteral(int line, int column, int length, string value)
        : base(line, column, length)
    {
        Value = value;
    }

    /// <summary>Contents of the literal.</summary>
    public string Value { get; }
}
/// <summary>
/// Token carrying the value of a boolean literal.
/// </summary>
public sealed class TokenBoolLiteral : Token
{
    /// <summary>Creates a boolean literal token at the given position.</summary>
    /// <param name="line">Line the token is on.</param>
    /// <param name="column">Column at which the token starts.</param>
    /// <param name="length">Number of characters the token spans.</param>
    /// <param name="value">Value of the literal.</param>
    public TokenBoolLiteral(int line, int column, int length, bool value)
        : base(line, column, length)
    {
        Value = value;
    }

    /// <summary>Value of the literal.</summary>
    public bool Value { get; }
}
/// <summary>
/// Punctuation and operator symbols. The trailing comment on each member shows the text that
/// <c>TokenExtensions.AsString</c> returns for it.
/// </summary>
public enum Symbol
{
OpenCurly, // {
CloseCurly, // }
OpenParen, // (
CloseParen, // )
Comma, // ,
Period, // .
Colon, // :
ColonColon, // ::
Caret, // ^
Bang, // !
Equal, // =
EqualEqual, // ==
BangEqual, // !=
LessThan, // <
LessThanLessThan, // <<
LessThanEqual, // <=
GreaterThan, // >
GreaterThanGreaterThan, // >>
GreaterThanEqual, // >=
Plus, // +
PlusEqual, // +=
Minus, // -
MinusEqual, // -=
Star, // *
StarEqual, // *=
ForwardSlash, // /
ForwardSlashEqual, // /=
Percent, // %
PercentEqual, // %=
Ampersand, // &
AmpersandAmpersand, // &&
Pipe, // |
PipePipe, // ||
}
/// <summary>
/// Token carrying a punctuation or operator symbol.
/// </summary>
public sealed class TokenSymbol : Token
{
    /// <summary>Creates a symbol token at the given position.</summary>
    /// <param name="line">Line the token is on.</param>
    /// <param name="column">Column at which the token starts.</param>
    /// <param name="length">Number of characters the token spans.</param>
    /// <param name="symbol">The symbol this token represents.</param>
    public TokenSymbol(int line, int column, int length, Symbol symbol)
        : base(line, column, length)
    {
        Symbol = symbol;
    }

    /// <summary>The symbol this token represents.</summary>
    public Symbol Symbol { get; }
}
/// <summary>
/// Reserved words. The trailing comment on each member shows the text that
/// <c>TokenExtensions.AsString</c> returns for it.
/// </summary>
public enum Keyword
{
Func, // func
Struct, // struct
Let, // let
If, // if
Else, // else
While, // while
Return, // return
Module, // module
}
/// <summary>
/// Token carrying a reserved word.
/// </summary>
public sealed class TokenKeyword : Token
{
    /// <summary>Creates a keyword token at the given position.</summary>
    /// <param name="line">Line the token is on.</param>
    /// <param name="column">Column at which the token starts.</param>
    /// <param name="length">Number of characters the token spans.</param>
    /// <param name="keyword">The keyword this token represents.</param>
    public TokenKeyword(int line, int column, int length, Keyword keyword)
        : base(line, column, length)
    {
        Keyword = keyword;
    }

    /// <summary>The keyword this token represents.</summary>
    public Keyword Keyword { get; }
}
/// <summary>
/// Helpers that turn <see cref="Symbol"/> and <see cref="Keyword"/> values back into text.
/// </summary>
public static class TokenExtensions
{
    /// <summary>Returns the text of <paramref name="symbol"/>.</summary>
    /// <param name="symbol">The symbol to convert.</param>
    /// <returns>The one- or two-character text of the symbol.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="symbol"/> is not one of the values handled here.</exception>
    public static string AsString(this Symbol symbol)
    {
        switch (symbol)
        {
            case Symbol.OpenCurly: return "{";
            case Symbol.CloseCurly: return "}";
            case Symbol.OpenParen: return "(";
            case Symbol.CloseParen: return ")";
            case Symbol.Comma: return ",";
            case Symbol.Period: return ".";
            case Symbol.Colon: return ":";
            case Symbol.ColonColon: return "::";
            case Symbol.Caret: return "^";
            case Symbol.Bang: return "!";
            case Symbol.Equal: return "=";
            case Symbol.EqualEqual: return "==";
            case Symbol.BangEqual: return "!=";
            case Symbol.LessThan: return "<";
            case Symbol.LessThanLessThan: return "<<";
            case Symbol.LessThanEqual: return "<=";
            case Symbol.GreaterThan: return ">";
            case Symbol.GreaterThanGreaterThan: return ">>";
            case Symbol.GreaterThanEqual: return ">=";
            case Symbol.Plus: return "+";
            case Symbol.PlusEqual: return "+=";
            case Symbol.Minus: return "-";
            case Symbol.MinusEqual: return "-=";
            case Symbol.Star: return "*";
            case Symbol.StarEqual: return "*=";
            case Symbol.ForwardSlash: return "/";
            case Symbol.ForwardSlashEqual: return "/=";
            case Symbol.Percent: return "%";
            case Symbol.PercentEqual: return "%=";
            case Symbol.Ampersand: return "&";
            case Symbol.AmpersandAmpersand: return "&&";
            case Symbol.Pipe: return "|";
            case Symbol.PipePipe: return "||";
            default: throw new ArgumentOutOfRangeException(nameof(symbol), symbol, null);
        }
    }

    /// <summary>Returns the text of the given keyword.</summary>
    /// <param name="symbol">The keyword to convert.</param>
    /// <returns>The keyword's text.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="symbol"/> is not one of the values handled here.</exception>
    public static string AsString(this Keyword symbol)
    {
        switch (symbol)
        {
            case Keyword.Func: return "func";
            case Keyword.Struct: return "struct";
            case Keyword.Let: return "let";
            case Keyword.If: return "if";
            case Keyword.Else: return "else";
            case Keyword.While: return "while";
            case Keyword.Return: return "return";
            case Keyword.Module: return "module";
            default: throw new ArgumentOutOfRangeException(nameof(symbol), symbol, null);
        }
    }
}