407 lines
11 KiB
C#
407 lines
11 KiB
C#
using System.Numerics;
|
|
using System.Text;
|
|
|
|
namespace Compiler;
|
|
|
|
/// <summary>
/// Hand-written lexer that turns source text into a flat list of <see cref="Token"/>s.
/// </summary>
/// <remarks>
/// Whitespace only separates tokens and is otherwise discarded. Line and column
/// numbers start at 1; a line feed starts a new line, every other character
/// advances the column by one.
/// </remarks>
/// <param name="contents">The source text to tokenize.</param>
public sealed class Tokenizer(string contents)
{
    /// <summary>
    /// Tokenizes <paramref name="contents"/> from the first to the last character.
    /// </summary>
    /// <param name="contents">The source text to tokenize.</param>
    /// <returns>The tokens in source order; empty when the text holds only whitespace.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="contents"/> is <c>null</c>.</exception>
    /// <exception cref="FormatException">
    /// The text contains a character that cannot start a token, a string literal
    /// without a closing quote, or a <c>0x</c>/<c>0b</c> prefix without digits.
    /// </exception>
    public static List<Token> Tokenize(string contents)
    {
        // Fail at the API boundary instead of with a NullReferenceException deep in the scanner.
        ArgumentNullException.ThrowIfNull(contents);

        return new Tokenizer(contents).Tokenize();
    }

    // Offset into `contents` of the next unread character.
    private int index;

    // 1-based position of the next unread character.
    private int line = 1;
    private int column = 1;

    /// <summary>Scans the whole input, skipping whitespace between tokens.</summary>
    private List<Token> Tokenize()
    {
        var tokens = new List<Token>();

        while (TryPeek(out var c))
        {
            if (char.IsWhiteSpace(c))
            {
                Consume();
                continue;
            }

            tokens.Add(ParseToken());
        }

        return tokens;
    }

    /// <summary>
    /// Reads exactly one token starting at the current position.
    /// The caller guarantees that at least one non-whitespace character is left.
    /// </summary>
    private Token ParseToken()
    {
        // Capture the start position up front: a string literal may contain line
        // feeds, so reading `line`/`column` after consumption would report the end
        // line and a meaningless (possibly negative) length.
        var startLine = line;
        var startColumn = column;
        var startIndex = index;
        var c = contents[index];

        // ASCII only: char.IsDigit also accepts other Unicode decimal digits, for
        // which the `c - '0'` arithmetic would yield a nonsensical value.
        if (char.IsAsciiDigit(c))
        {
            var value = (c, Peek(1)) switch
            {
                ('0', 'x') => ParseInteger(radix: 16, prefixLength: 2),
                ('0', 'b') => ParseInteger(radix: 2, prefixLength: 2),
                _ => ParseInteger(radix: 10, prefixLength: 0),
            };

            return new TokenIntLiteral(startLine, startColumn, index - startIndex, value);
        }

        if (c == '"')
        {
            Consume(); // opening quote

            while (TryPeek(out var inner) && inner != '"')
            {
                Consume();
            }

            if (!TryPeek(out _))
            {
                throw new FormatException(
                    $"Unterminated string literal starting at line {startLine}, column {startColumn}");
            }

            // Everything between the quotes, taken verbatim (no escape processing).
            var text = contents[(startIndex + 1)..index];

            Consume(); // closing quote

            return new TokenStringLiteral(startLine, startColumn, index - startIndex, text);
        }

        if (MatchSymbol(c, Peek(1)) is { } match)
        {
            for (var i = 0; i < match.Width; i++)
            {
                Consume();
            }

            return new TokenSymbol(startLine, startColumn, index - startIndex, match.Symbol);
        }

        if (char.IsLetter(c) || c == '_')
        {
            while (TryPeek(out var part) && (char.IsLetterOrDigit(part) || part == '_'))
            {
                Consume();
            }

            var word = contents[startIndex..index];
            var length = index - startIndex;

            return word switch
            {
                "func" => new TokenKeyword(startLine, startColumn, length, Keyword.Func),
                "let" => new TokenKeyword(startLine, startColumn, length, Keyword.Let),
                "if" => new TokenKeyword(startLine, startColumn, length, Keyword.If),
                "return" => new TokenKeyword(startLine, startColumn, length, Keyword.Return),
                "true" => new TokenBoolLiteral(startLine, startColumn, length, true),
                "false" => new TokenBoolLiteral(startLine, startColumn, length, false),
                _ => new TokenIdent(startLine, startColumn, length, word),
            };
        }

        throw new FormatException($"Unexpected character '{c}'");
    }

    /// <summary>
    /// Reads an integer literal in the given radix. <c>_</c> characters are accepted
    /// anywhere after the prefix and ignored; reading stops at the first character
    /// that is not a digit of the radix.
    /// </summary>
    /// <param name="radix">Numeric base of the digits: 2, 10 or 16.</param>
    /// <param name="prefixLength">Number of prefix characters (<c>0x</c>, <c>0b</c>) to skip first.</param>
    /// <exception cref="FormatException">No digit follows the prefix.</exception>
    private BigInteger ParseInteger(int radix, int prefixLength)
    {
        var literalLine = line;
        var literalColumn = column;

        for (var i = 0; i < prefixLength; i++)
        {
            Consume();
        }

        var value = BigInteger.Zero;
        var digitCount = 0;

        while (TryPeek(out var c))
        {
            if (c == '_')
            {
                Consume();
                continue;
            }

            var digit = DigitValue(c);
            if (digit < 0 || digit >= radix)
            {
                break;
            }

            Consume();
            value = value * radix + digit;
            digitCount++;
        }

        // A bare "0x" / "0b" used to be accepted silently as the value 0.
        if (digitCount == 0)
        {
            throw new FormatException(
                $"Integer literal at line {literalLine}, column {literalColumn} has no digits");
        }

        return value;
    }

    /// <summary>Numeric value of an ASCII hexadecimal digit, or -1 for any other character.</summary>
    private static int DigitValue(char c) => c switch
    {
        >= '0' and <= '9' => c - '0',
        >= 'a' and <= 'f' => c - 'a' + 10,
        >= 'A' and <= 'F' => c - 'A' + 10,
        _ => -1,
    };

    /// <summary>
    /// Maps the character <paramref name="c"/> (and, for two-character operators, the
    /// character after it) to a symbol and the number of characters it occupies.
    /// Returns <c>null</c> when <paramref name="c"/> does not start a symbol.
    /// </summary>
    private static (Symbol Symbol, int Width)? MatchSymbol(char c, char? next)
    {
        var followedByEqual = next is '=';

        return c switch
        {
            '{' => (Symbol.OpenCurly, 1),
            '}' => (Symbol.CloseCurly, 1),
            '(' => (Symbol.OpenParen, 1),
            ')' => (Symbol.CloseParen, 1),
            ',' => (Symbol.Comma, 1),
            ':' => (Symbol.Colon, 1),
            '^' => (Symbol.Caret, 1),
            '!' => followedByEqual ? (Symbol.BangEqual, 2) : (Symbol.Bang, 1),
            '=' => followedByEqual ? (Symbol.EqualEqual, 2) : (Symbol.Equal, 1),
            '<' => followedByEqual ? (Symbol.LessThanEqual, 2) : (Symbol.LessThan, 1),
            '>' => followedByEqual ? (Symbol.GreaterThanEqual, 2) : (Symbol.GreaterThan, 1),
            '+' => followedByEqual ? (Symbol.PlusEqual, 2) : (Symbol.Plus, 1),
            '-' => followedByEqual ? (Symbol.MinusEqual, 2) : (Symbol.Minus, 1),
            '*' => followedByEqual ? (Symbol.StarEqual, 2) : (Symbol.Star, 1),
            '/' => followedByEqual ? (Symbol.ForwardSlashEqual, 2) : (Symbol.ForwardSlash, 1),
            _ => null,
        };
    }

    /// <summary>
    /// Returns the current character and advances past it, updating the line and
    /// column counters.
    /// </summary>
    /// <exception cref="InvalidOperationException">The input is exhausted.</exception>
    private char Consume()
    {
        if (index >= contents.Length)
        {
            throw new InvalidOperationException("End of tokens");
        }

        var c = contents[index];

        if (c == '\n')
        {
            line += 1;
            column = 1;
        }
        else
        {
            column += 1;
        }

        index += 1;

        return c;
    }

    /// <summary>
    /// Returns the character <paramref name="offset"/> positions ahead of the current
    /// one without consuming anything, or <c>null</c> when that position lies outside
    /// the input.
    /// </summary>
    private char? Peek(int offset = 0)
    {
        var position = index + offset;

        if (position < 0 || position >= contents.Length)
        {
            return null;
        }

        return contents[position];
    }

    /// <summary>
    /// Non-consuming read of the current character.
    /// </summary>
    /// <param name="c">The current character, or <c>'\0'</c> when the input is exhausted.</param>
    /// <returns><c>true</c> when a character was available.</returns>
    private bool TryPeek(out char c)
    {
        if (index >= contents.Length)
        {
            c = '\0';
            return false;
        }

        c = contents[index];
        return true;
    }
}
|
|
|
|
/// <summary>
/// Common base of all tokens: stores the source position handed to the constructor.
/// </summary>
/// <param name="line">Initial value of <see cref="Line"/>.</param>
/// <param name="column">Initial value of <see cref="Column"/>.</param>
/// <param name="length">Initial value of <see cref="Length"/>.</param>
public abstract class Token(int line, int column, int length)
{
    /// <summary>Line of the token; the tokenizer starts counting at 1.</summary>
    public int Line = line;

    /// <summary>Column of the token's first character; the tokenizer starts counting at 1.</summary>
    public int Column = column;

    /// <summary>Number of source characters the token spans.</summary>
    public int Length = length;
}
|
|
|
|
/// <summary>
/// An identifier: a word that is neither a keyword nor a boolean literal.
/// </summary>
/// <param name="line">Line of the token.</param>
/// <param name="column">Column of the token's first character.</param>
/// <param name="length">Number of source characters the token spans.</param>
/// <param name="ident">The identifier text.</param>
public sealed class TokenIdent(int line, int column, int length, string ident)
    : Token(line, column, length)
{
    /// <summary>The identifier text exactly as written in the source.</summary>
    public readonly string Ident = ident;
}
|
|
|
|
/// <summary>
/// An integer literal (decimal, <c>0x</c> hexadecimal or <c>0b</c> binary) of arbitrary size.
/// </summary>
/// <param name="line">Line of the token.</param>
/// <param name="column">Column of the token's first character.</param>
/// <param name="length">Number of source characters the token spans.</param>
/// <param name="value">The parsed numeric value.</param>
public sealed class TokenIntLiteral(int line, int column, int length, BigInteger value)
    : Token(line, column, length)
{
    /// <summary>The parsed numeric value of the literal.</summary>
    public BigInteger Value = value;
}
|
|
|
|
/// <summary>
/// A double-quoted string literal.
/// </summary>
/// <param name="line">Line of the token.</param>
/// <param name="column">Column of the token's first character.</param>
/// <param name="length">Number of source characters the token spans.</param>
/// <param name="value">The text between the quotes.</param>
public sealed class TokenStringLiteral(int line, int column, int length, string value)
    : Token(line, column, length)
{
    /// <summary>The text between the quotes, without the quotes themselves.</summary>
    public readonly string Value = value;
}
|
|
|
|
/// <summary>
/// A boolean literal, written <c>true</c> or <c>false</c> in the source.
/// </summary>
/// <param name="line">Line of the token.</param>
/// <param name="column">Column of the token's first character.</param>
/// <param name="length">Number of source characters the token spans.</param>
/// <param name="value">The literal's value.</param>
public sealed class TokenBoolLiteral(int line, int column, int length, bool value)
    : Token(line, column, length)
{
    /// <summary>The value denoted by the literal.</summary>
    public readonly bool Value = value;
}
|
|
|
|
/// <summary>
/// Punctuation and operator tokens recognised by the tokenizer.
/// Members keep their implicit values (0, 1, 2, ... in declaration order).
/// </summary>
public enum Symbol
{
    /// <summary><c>{</c></summary>
    OpenCurly,

    /// <summary><c>}</c></summary>
    CloseCurly,

    /// <summary><c>(</c></summary>
    OpenParen,

    /// <summary><c>)</c></summary>
    CloseParen,

    /// <summary><c>,</c></summary>
    Comma,

    /// <summary><c>:</c></summary>
    Colon,

    /// <summary><c>^</c></summary>
    Caret,

    /// <summary><c>!</c></summary>
    Bang,

    /// <summary><c>=</c></summary>
    Equal,

    /// <summary><c>==</c></summary>
    EqualEqual,

    /// <summary><c>!=</c></summary>
    BangEqual,

    /// <summary><c>&lt;</c></summary>
    LessThan,

    /// <summary><c>&lt;=</c></summary>
    LessThanEqual,

    /// <summary><c>&gt;</c></summary>
    GreaterThan,

    /// <summary><c>&gt;=</c></summary>
    GreaterThanEqual,

    /// <summary><c>+</c></summary>
    Plus,

    /// <summary><c>+=</c></summary>
    PlusEqual,

    /// <summary><c>-</c></summary>
    Minus,

    /// <summary><c>-=</c></summary>
    MinusEqual,

    /// <summary><c>*</c></summary>
    Star,

    /// <summary><c>*=</c></summary>
    StarEqual,

    /// <summary><c>/</c></summary>
    ForwardSlash,

    /// <summary><c>/=</c></summary>
    ForwardSlashEqual,
}
|
|
|
|
/// <summary>
/// A punctuation or operator token.
/// </summary>
/// <param name="line">Line of the token.</param>
/// <param name="column">Column of the token's first character.</param>
/// <param name="length">Number of source characters the token spans.</param>
/// <param name="symbol">Which symbol was read.</param>
public sealed class TokenSymbol(int line, int column, int length, Symbol symbol)
    : Token(line, column, length)
{
    /// <summary>Which symbol this token represents.</summary>
    public readonly Symbol Symbol = symbol;
}
|
|
|
|
/// <summary>
/// Reserved words recognised by the tokenizer.
/// Members keep their implicit values (0, 1, 2, ... in declaration order).
/// </summary>
public enum Keyword
{
    /// <summary><c>func</c></summary>
    Func,

    /// <summary><c>let</c></summary>
    Let,

    /// <summary><c>if</c></summary>
    If,

    /// <summary><c>return</c></summary>
    Return,
}
|
|
|
|
/// <summary>
/// A reserved-word token.
/// </summary>
/// <param name="line">Line of the token.</param>
/// <param name="column">Column of the token's first character.</param>
/// <param name="length">Number of source characters the token spans.</param>
/// <param name="keyword">Which keyword was read.</param>
public sealed class TokenKeyword(int line, int column, int length, Keyword keyword)
    : Token(line, column, length)
{
    /// <summary>Which keyword this token represents.</summary>
    public readonly Keyword Keyword = keyword;
}