597 lines
18 KiB
C#
597 lines
18 KiB
C#
using System.Numerics;
|
|
using System.Text;
|
|
|
|
namespace Compiler;
|
|
|
|
public sealed class Tokenizer(string fileName, string contents)
|
|
{
|
|
/// <summary>
/// Convenience entry point: creates a <see cref="Tokenizer"/> over the given input and runs it.
/// </summary>
/// <param name="fileName">File name handed to the tokenizer; it is attached to diagnostic locations.</param>
/// <param name="contents">Source text to scan.</param>
/// <param name="diagnostics">Receives the diagnostics collected by the instance-level scan.</param>
/// <returns>The result of the instance-level scan: the token list, or <c>null</c>.</returns>
public static List<Token>? Tokenize(string fileName, string contents, out List<Diagnostic> diagnostics)
    => new Tokenizer(fileName, contents).Tokenize(out diagnostics);
|
|
|
|
// Offset into contents of the next unread character.
private int index;

// Line of the next unread character; starts at 1 and is advanced by TryConsume on '\n'.
private int line = 1;

// Column of the next unread character; starts at 1 and is reset to 1 by TryConsume on '\n'.
private int column = 1;

/// <summary>
/// Scans the whole input, skipping whitespace and <c>//</c> line comments, and collects
/// one token per call to <see cref="ParseToken"/>.
/// </summary>
/// <param name="diagnostics">Receives every diagnostic raised while scanning.</param>
/// <returns>
/// The tokens in source order, or <c>null</c> when at least one diagnostic has
/// <see cref="DiagnosticSeverity.Error"/> severity.
/// </returns>
private List<Token>? Tokenize(out List<Diagnostic> diagnostics)
{
    var tokens = new List<Token>();
    diagnostics = [];

    while (TryPeek(out var c))
    {
        // Remembered so the error handler can tell whether any input was consumed.
        var startIndex = index;

        try
        {
            if (char.IsWhiteSpace(c))
            {
                Consume();
                continue;
            }

            if (c == '/' && Peek(1) == '/')
            {
                // Skip up to, but not including, the line break. The newline (if any) is
                // handled by the whitespace branch on the next iteration, so a comment that
                // ends at end-of-file without a trailing newline is not an error.
                while (TryPeek(out c) && c != '\n')
                    Consume();

                continue;
            }

            tokens.Add(ParseToken());
        }
        catch (CompileException e)
        {
            diagnostics.Add(e.Diagnostic);

            // Only force progress when the failed attempt consumed nothing; this still
            // prevents an infinite loop, but no longer swallows the first character of
            // whatever follows a partially consumed, malformed token.
            if (index == startIndex)
                TryConsume(out _);
        }
    }

    if (diagnostics.Any(x => x.Severity == DiagnosticSeverity.Error))
        return null;

    return tokens;
}
|
|
|
|
/// <summary>
/// Reads exactly one token starting at the current position.
/// </summary>
/// <remarks>
/// The caller must ensure a character is available: the first character is read with
/// <c>Peek()!.Value</c> without an end-of-input check.
/// Integer literals start with an ASCII digit only. Other Unicode decimal digits are
/// rejected as unexpected characters, because digit values are computed relative to '0'.
/// </remarks>
/// <returns>The token that was read.</returns>
/// <exception cref="CompileException">
/// Thrown for a radix prefix without digits, an unterminated string literal, or a
/// character that cannot start any token.
/// </exception>
private Token ParseToken()
{
    var startColumn = column;
    var c = Peek()!.Value;
    var next = Peek(1);

    if (char.IsAsciiDigit(c))
    {
        if (c == '0' && next is 'x')
            return ParsePrefixedIntLiteral(startColumn, 16, "Expected hexadecimal digits after 0x");

        if (c == '0' && next is 'b')
            return ParsePrefixedIntLiteral(startColumn, 2, "Expected binary digits after 0b");

        return ParseDecimalIntLiteral(startColumn);
    }

    // Two-character operators are listed before their one-character prefixes.
    switch (c)
    {
        case '"':
            return ParseStringLiteral(startColumn);

        case '{':
            return TakeSymbol(startColumn, 1, Symbol.OpenCurly);
        case '}':
            return TakeSymbol(startColumn, 1, Symbol.CloseCurly);
        case '(':
            return TakeSymbol(startColumn, 1, Symbol.OpenParen);
        case ')':
            return TakeSymbol(startColumn, 1, Symbol.CloseParen);
        case ',':
            return TakeSymbol(startColumn, 1, Symbol.Comma);
        case '.':
            return TakeSymbol(startColumn, 1, Symbol.Period);

        case ':' when next is ':':
            return TakeSymbol(startColumn, 2, Symbol.ColonColon);
        case ':':
            return TakeSymbol(startColumn, 1, Symbol.Colon);

        case '^':
            return TakeSymbol(startColumn, 1, Symbol.Caret);

        case '!' when next is '=':
            return TakeSymbol(startColumn, 2, Symbol.BangEqual);
        case '!':
            return TakeSymbol(startColumn, 1, Symbol.Bang);

        case '=' when next is '=':
            return TakeSymbol(startColumn, 2, Symbol.EqualEqual);
        case '=':
            return TakeSymbol(startColumn, 1, Symbol.Equal);

        case '<' when next is '<':
            return TakeSymbol(startColumn, 2, Symbol.LessThanLessThan);
        case '<' when next is '=':
            return TakeSymbol(startColumn, 2, Symbol.LessThanEqual);
        case '<':
            return TakeSymbol(startColumn, 1, Symbol.LessThan);

        case '>' when next is '>':
            return TakeSymbol(startColumn, 2, Symbol.GreaterThanGreaterThan);
        case '>' when next is '=':
            return TakeSymbol(startColumn, 2, Symbol.GreaterThanEqual);
        case '>':
            return TakeSymbol(startColumn, 1, Symbol.GreaterThan);

        case '+' when next is '=':
            return TakeSymbol(startColumn, 2, Symbol.PlusEqual);
        case '+':
            return TakeSymbol(startColumn, 1, Symbol.Plus);

        case '-' when next is '=':
            return TakeSymbol(startColumn, 2, Symbol.MinusEqual);
        case '-':
            return TakeSymbol(startColumn, 1, Symbol.Minus);

        case '*' when next is '=':
            return TakeSymbol(startColumn, 2, Symbol.StarEqual);
        case '*':
            return TakeSymbol(startColumn, 1, Symbol.Star);

        case '/' when next is '=':
            return TakeSymbol(startColumn, 2, Symbol.ForwardSlashEqual);
        case '/':
            return TakeSymbol(startColumn, 1, Symbol.ForwardSlash);

        case '%' when next is '=':
            return TakeSymbol(startColumn, 2, Symbol.PercentEqual);
        case '%':
            return TakeSymbol(startColumn, 1, Symbol.Percent);

        case '&' when next is '&':
            return TakeSymbol(startColumn, 2, Symbol.AmpersandAmpersand);
        case '&':
            return TakeSymbol(startColumn, 1, Symbol.Ampersand);

        case '|' when next is '|':
            return TakeSymbol(startColumn, 2, Symbol.PipePipe);
        case '|':
            return TakeSymbol(startColumn, 1, Symbol.Pipe);

        default:
            if (char.IsLetter(c) || c == '_')
                return ParseWord(startColumn);

            throw new CompileException(Diagnostic.Error($"Unexpected character '{c}'").At(fileName, line, column, 1).Build());
    }
}

// Consumes `length` characters and wraps them in a symbol token starting at startColumn.
private TokenSymbol TakeSymbol(int startColumn, int length, Symbol symbol)
{
    for (var i = 0; i < length; i++)
        Consume();

    return new TokenSymbol(line, startColumn, column - startColumn, symbol);
}

// Reads an integer literal with a two-character radix prefix (0x / 0b); at least one digit is required.
private TokenIntLiteral ParsePrefixedIntLiteral(int startColumn, int radix, string missingDigitsMessage)
{
    Consume(); // leading '0'
    Consume(); // radix marker

    if (!ReadDigits(radix, out var value))
        throw new CompileException(Diagnostic.Error(missingDigitsMessage).At(fileName, line, startColumn, column - startColumn).Build());

    return new TokenIntLiteral(line, startColumn, column - startColumn, value);
}

// Reads a decimal integer literal; the caller has already verified that it starts with an ASCII digit.
private TokenIntLiteral ParseDecimalIntLiteral(int startColumn)
{
    ReadDigits(10, out var value);
    return new TokenIntLiteral(line, startColumn, column - startColumn, value);
}

// Accumulates digits of the given radix, ignoring '_' separators; returns whether any digit was read.
private bool ReadDigits(int radix, out BigInteger value)
{
    value = BigInteger.Zero;
    var seenDigit = false;

    while (TryPeek(out var c))
    {
        if (c == '_')
        {
            Consume();
            continue;
        }

        var digit = DigitValue(c);
        if (digit < 0 || digit >= radix)
            break;

        Consume();
        value = value * radix + digit;
        seenDigit = true;
    }

    return seenDigit;
}

// Maps an ASCII digit or hex letter to its numeric value; returns -1 for anything else.
private static int DigitValue(char c) => c switch
{
    >= '0' and <= '9' => c - '0',
    >= 'a' and <= 'f' => c - 'a' + 10,
    >= 'A' and <= 'F' => c - 'A' + 10,
    _ => -1
};

// Reads a double-quoted string literal; it must be closed before the end of the line.
private TokenStringLiteral ParseStringLiteral(int startColumn)
{
    Consume(); // opening quote
    var buf = new StringBuilder();

    while (true)
    {
        if (!TryPeek(out var c))
            throw new CompileException(Diagnostic.Error("Unterminated string literal").At(fileName, line, column, 0).Build());

        if (c == '"')
            break;

        if (c == '\n')
            throw new CompileException(Diagnostic.Error("Unterminated string literal").At(fileName, line, column, 1).Build());

        buf.Append(Consume());
    }

    Consume(); // closing quote
    return new TokenStringLiteral(line, startColumn, column - startColumn, buf.ToString());
}

// Reads a run of letters, digits and underscores and classifies it as keyword, bool literal or identifier.
private Token ParseWord(int startColumn)
{
    var buf = new StringBuilder();

    while (TryPeek(out var c) && (char.IsLetterOrDigit(c) || c == '_'))
        buf.Append(Consume());

    var value = buf.ToString();
    var length = column - startColumn;

    return value switch
    {
        "func" => new TokenKeyword(line, startColumn, length, Keyword.Func),
        "struct" => new TokenKeyword(line, startColumn, length, Keyword.Struct),
        "let" => new TokenKeyword(line, startColumn, length, Keyword.Let),
        "if" => new TokenKeyword(line, startColumn, length, Keyword.If),
        "else" => new TokenKeyword(line, startColumn, length, Keyword.Else),
        "while" => new TokenKeyword(line, startColumn, length, Keyword.While),
        "return" => new TokenKeyword(line, startColumn, length, Keyword.Return),
        "module" => new TokenKeyword(line, startColumn, length, Keyword.Module),
        "true" => new TokenBoolLiteral(line, startColumn, length, true),
        "false" => new TokenBoolLiteral(line, startColumn, length, false),
        _ => new TokenIdent(line, startColumn, length, value)
    };
}
|
|
|
|
/// <summary>
/// Reads the next character, if any, and advances the position bookkeeping.
/// </summary>
/// <param name="c">The character read, or <c>'\0'</c> when the input is exhausted.</param>
/// <returns><c>true</c> when a character was read; <c>false</c> at end of input.</returns>
private bool TryConsume(out char c)
{
    if (!TryPeek(out c))
        return false;

    index++;

    // A line feed moves to the start of the next line; any other character advances one column.
    if (c != '\n')
    {
        column++;
        return true;
    }

    line++;
    column = 1;
    return true;
}
|
|
|
|
/// <summary>
/// Reads the next character and advances past it.
/// </summary>
/// <returns>The character that was read.</returns>
/// <exception cref="CompileException">Thrown when there is no character left to read.</exception>
private char Consume()
{
    if (TryConsume(out var consumed))
        return consumed;

    throw new CompileException(Diagnostic.Error("Unexpected end of file").At(fileName, line, column, 0).Build());
}
|
|
|
|
/// <summary>
/// Looks at the character <paramref name="offset"/> positions ahead without consuming anything.
/// </summary>
/// <param name="offset">Distance from the current position; 0 is the next unread character.</param>
/// <returns>The character at that position, or <c>null</c> when it lies at or past the end of the input.</returns>
private char? Peek(int offset = 0)
{
    var position = index + offset;
    return position < contents.Length ? contents[position] : null;
}
|
|
|
|
/// <summary>
/// Looks at the next unread character without consuming it.
/// </summary>
/// <param name="c">The next character, or <c>'\0'</c> when the input is exhausted.</param>
/// <returns><c>true</c> when a character is available; <c>false</c> at end of input.</returns>
private bool TryPeek(out char c)
{
    var hasNext = index < contents.Length;
    c = hasNext ? contents[index] : '\0';
    return hasNext;
}
|
|
}
|
|
|
|
/// <summary>
/// Common base of all tokens: stores where the token was found and how long it is.
/// </summary>
public abstract class Token
{
    /// <summary>Initializes the location data shared by every token kind.</summary>
    protected Token(int line, int column, int length)
    {
        Line = line;
        Column = column;
        Length = length;
    }

    /// <summary>Line number supplied at construction.</summary>
    public int Line { get; }

    /// <summary>Column number supplied at construction.</summary>
    public int Column { get; }

    /// <summary>Length supplied at construction.</summary>
    public int Length { get; }
}
|
|
|
|
/// <summary>
/// Token carrying an identifier's text.
/// </summary>
public sealed class TokenIdent : Token
{
    /// <summary>Creates an identifier token at the given location.</summary>
    public TokenIdent(int line, int column, int length, string ident)
        : base(line, column, length)
    {
        Ident = ident;
    }

    /// <summary>The identifier text.</summary>
    public string Ident { get; }
}
|
|
|
|
/// <summary>
/// Token carrying the value of an integer literal as an arbitrary-precision integer.
/// </summary>
public sealed class TokenIntLiteral : Token
{
    /// <summary>Creates an integer literal token at the given location.</summary>
    public TokenIntLiteral(int line, int column, int length, BigInteger value)
        : base(line, column, length)
    {
        Value = value;
    }

    /// <summary>The literal's numeric value.</summary>
    public BigInteger Value { get; }
}
|
|
|
|
/// <summary>
/// Token carrying the text of a string literal.
/// </summary>
public sealed class TokenStringLiteral : Token
{
    /// <summary>Creates a string literal token at the given location.</summary>
    public TokenStringLiteral(int line, int column, int length, string value)
        : base(line, column, length)
    {
        Value = value;
    }

    /// <summary>The literal's text.</summary>
    public string Value { get; }
}
|
|
|
|
/// <summary>
/// Token carrying the value of a boolean literal.
/// </summary>
public sealed class TokenBoolLiteral : Token
{
    /// <summary>Creates a boolean literal token at the given location.</summary>
    public TokenBoolLiteral(int line, int column, int length, bool value)
        : base(line, column, length)
    {
        Value = value;
    }

    /// <summary>The literal's value.</summary>
    public bool Value { get; }
}
|
|
|
|
/// <summary>
/// Punctuation and operator kinds. Member order is significant because the
/// underlying values are assigned implicitly.
/// </summary>
public enum Symbol
{
    // Brackets and separators
    OpenCurly,
    CloseCurly,
    OpenParen,
    CloseParen,
    Comma,
    Period,
    Colon,
    ColonColon,
    Caret,

    // Negation, assignment and equality
    Bang,
    Equal,
    EqualEqual,
    BangEqual,

    // Comparison and shifts
    LessThan,
    LessThanLessThan,
    LessThanEqual,
    GreaterThan,
    GreaterThanGreaterThan,
    GreaterThanEqual,

    // Arithmetic and their compound-assignment forms
    Plus,
    PlusEqual,
    Minus,
    MinusEqual,
    Star,
    StarEqual,
    ForwardSlash,
    ForwardSlashEqual,
    Percent,
    PercentEqual,

    // Ampersand and pipe, single and doubled
    Ampersand,
    AmpersandAmpersand,
    Pipe,
    PipePipe,
}
|
|
|
|
/// <summary>
/// Token carrying a <see cref="Compiler.Symbol"/> kind.
/// </summary>
public sealed class TokenSymbol : Token
{
    /// <summary>Creates a symbol token at the given location.</summary>
    public TokenSymbol(int line, int column, int length, Symbol symbol)
        : base(line, column, length)
    {
        Symbol = symbol;
    }

    /// <summary>Which symbol this token represents.</summary>
    public Symbol Symbol { get; }
}
|
|
|
|
/// <summary>
/// Reserved words. Member order is significant because the underlying values
/// are assigned implicitly.
/// </summary>
public enum Keyword
{
    // Declarations
    Func,
    Struct,
    Let,

    // Control flow
    If,
    Else,
    While,
    Return,

    // Module declaration
    Module,
}
|
|
|
|
/// <summary>
/// Token carrying a <see cref="Compiler.Keyword"/> kind.
/// </summary>
public sealed class TokenKeyword : Token
{
    /// <summary>Creates a keyword token at the given location.</summary>
    public TokenKeyword(int line, int column, int length, Keyword keyword)
        : base(line, column, length)
    {
        Keyword = keyword;
    }

    /// <summary>Which keyword this token represents.</summary>
    public Keyword Keyword { get; }
}
|
|
|
|
/// <summary>
/// Helpers that turn <see cref="Symbol"/> and <see cref="Keyword"/> values back into their source spelling.
/// </summary>
public static class TokenExtensions
{
    /// <summary>Returns the source text of a symbol, e.g. <c>"::"</c> for <see cref="Symbol.ColonColon"/>.</summary>
    /// <param name="symbol">The symbol to spell out.</param>
    /// <exception cref="ArgumentOutOfRangeException">The value is not a member listed in the mapping.</exception>
    public static string AsString(this Symbol symbol) => symbol switch
    {
        Symbol.OpenCurly => "{",
        Symbol.CloseCurly => "}",
        Symbol.OpenParen => "(",
        Symbol.CloseParen => ")",
        Symbol.Comma => ",",
        Symbol.Period => ".",
        Symbol.Colon => ":",
        Symbol.ColonColon => "::",
        Symbol.Caret => "^",
        Symbol.Bang => "!",
        Symbol.Equal => "=",
        Symbol.EqualEqual => "==",
        Symbol.BangEqual => "!=",
        Symbol.LessThan => "<",
        Symbol.LessThanLessThan => "<<",
        Symbol.LessThanEqual => "<=",
        Symbol.GreaterThan => ">",
        Symbol.GreaterThanGreaterThan => ">>",
        Symbol.GreaterThanEqual => ">=",
        Symbol.Plus => "+",
        Symbol.PlusEqual => "+=",
        Symbol.Minus => "-",
        Symbol.MinusEqual => "-=",
        Symbol.Star => "*",
        Symbol.StarEqual => "*=",
        Symbol.ForwardSlash => "/",
        Symbol.ForwardSlashEqual => "/=",
        Symbol.Percent => "%",
        Symbol.PercentEqual => "%=",
        Symbol.Ampersand => "&",
        Symbol.AmpersandAmpersand => "&&",
        Symbol.Pipe => "|",
        Symbol.PipePipe => "||",
        _ => throw new ArgumentOutOfRangeException(nameof(symbol), symbol, null)
    };

    /// <summary>Returns the source text of a keyword, e.g. <c>"func"</c> for <see cref="Keyword.Func"/>.</summary>
    /// <param name="symbol">The keyword to spell out.</param>
    /// <exception cref="ArgumentOutOfRangeException">The value is not a member listed in the mapping.</exception>
    public static string AsString(this Keyword symbol) => symbol switch
    {
        Keyword.Func => "func",
        Keyword.Struct => "struct",
        Keyword.Let => "let",
        Keyword.If => "if",
        Keyword.Else => "else",
        Keyword.While => "while",
        Keyword.Return => "return",
        Keyword.Module => "module",
        _ => throw new ArgumentOutOfRangeException(nameof(symbol), symbol, null)
    };
}