using System.Numerics;
using System.Text;

namespace Compiler;

/// <summary>
/// Splits source text into a flat list of <see cref="Token"/>s.
/// </summary>
/// <remarks>
/// Lines and columns are 1-based and are counted in UTF-16 code units. Only <c>'\n'</c>
/// advances the line counter; every other character advances the column by one.
/// </remarks>
public sealed class Tokenizer(string contents)
{
    /// <summary>Tokenizes <paramref name="contents"/> from its first to its last character.</summary>
    /// <param name="contents">The source text to scan.</param>
    /// <returns>The tokens in source order; whitespace is skipped and produces no token.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="contents"/> is <c>null</c>.</exception>
    /// <exception cref="FormatException">
    /// The text contains an unexpected character, an unterminated string literal, or a
    /// <c>0x</c>/<c>0b</c> prefix that is not followed by at least one digit.
    /// </exception>
    public static List<Token> Tokenize(string contents)
    {
        ArgumentNullException.ThrowIfNull(contents);
        return new Tokenizer(contents).Tokenize();
    }

    // Current read position.
    private int index;
    private int line = 1;
    private int column = 1;

    // Position of the first character of the token being scanned. Captured before anything
    // is consumed so that tokens spanning a newline still report where they began.
    private int startIndex;
    private int startLine;
    private int startColumn;

    /// <summary>Scans the whole input, skipping whitespace between tokens.</summary>
    private List<Token> Tokenize()
    {
        var tokens = new List<Token>();
        while (TryPeek(out var c))
        {
            if (char.IsWhiteSpace(c))
            {
                Consume();
                continue;
            }

            tokens.Add(ParseToken());
        }

        return tokens;
    }

    /// <summary>
    /// Scans exactly one token. The caller guarantees that the current character exists and
    /// is not whitespace.
    /// </summary>
    private Token ParseToken()
    {
        startIndex = index;
        startLine = line;
        startColumn = column;

        var c = contents[index];

        // ASCII digits only: the digit value is computed as c - '0', which is meaningless
        // for the other Unicode decimal digits that char.IsDigit would accept.
        if (char.IsAsciiDigit(c))
        {
            return ParseIntLiteral();
        }

        if (c == '"')
        {
            return ParseStringLiteral();
        }

        if (char.IsLetter(c) || c == '_')
        {
            return ParseWord();
        }

        return ParseSymbol(c);
    }

    /// <summary>
    /// Scans a decimal, hexadecimal (<c>0x</c>) or binary (<c>0b</c>) integer literal.
    /// Underscores between digits are skipped.
    /// </summary>
    private TokenIntLiteral ParseIntLiteral()
    {
        BigInteger value;
        if (Peek() is '0' && Peek(1) is 'x')
        {
            Consume();
            Consume();
            value = ReadDigits(16, "0x");
        }
        else if (Peek() is '0' && Peek(1) is 'b')
        {
            Consume();
            Consume();
            value = ReadDigits(2, "0b");
        }
        else
        {
            // The first character is already known to be a digit, so this cannot come back empty.
            value = ReadDigits(10, "");
        }

        return new TokenIntLiteral(startLine, startColumn, index - startIndex, value);
    }

    /// <summary>
    /// Consumes digits of the given <paramref name="radix"/> (and any underscores mixed in)
    /// and returns their accumulated value. Stops at the first character that is not a digit
    /// of that radix, leaving it unconsumed.
    /// </summary>
    /// <exception cref="FormatException">No digit was found.</exception>
    private BigInteger ReadDigits(int radix, string prefix)
    {
        var value = BigInteger.Zero;
        var digitCount = 0;
        while (TryPeek(out var c))
        {
            if (c == '_')
            {
                Consume();
                continue;
            }

            var digit = DigitValue(c);
            if (digit < 0 || digit >= radix)
            {
                break;
            }

            Consume();
            value = value * radix + digit;
            digitCount++;
        }

        if (digitCount == 0)
        {
            throw new FormatException(
                $"Expected at least one digit after '{prefix}' at line {startLine}, column {startColumn}");
        }

        return value;
    }

    /// <summary>Returns the value of an ASCII hexadecimal digit, or -1 for any other character.</summary>
    private static int DigitValue(char c) => c switch
    {
        >= '0' and <= '9' => c - '0',
        >= 'a' and <= 'f' => c - 'a' + 10,
        >= 'A' and <= 'F' => c - 'A' + 10,
        _ => -1,
    };

    /// <summary>
    /// Scans a double-quoted string literal. Characters between the quotes are taken verbatim;
    /// no escape sequences are interpreted, and the literal may span several lines.
    /// </summary>
    /// <exception cref="FormatException">The input ends before the closing quote.</exception>
    private TokenStringLiteral ParseStringLiteral()
    {
        Consume(); // opening quote

        var text = new StringBuilder();
        while (true)
        {
            if (!TryPeek(out var c))
            {
                throw new FormatException(
                    $"Unterminated string literal starting at line {startLine}, column {startColumn}");
            }

            Consume();
            if (c == '"')
            {
                break;
            }

            text.Append(c);
        }

        return new TokenStringLiteral(startLine, startColumn, index - startIndex, text.ToString());
    }

    /// <summary>
    /// Scans a run of letters, digits and underscores and classifies it as a keyword,
    /// a boolean literal, or an identifier.
    /// </summary>
    private Token ParseWord()
    {
        var text = new StringBuilder();
        while (TryPeek(out var c) && (char.IsLetterOrDigit(c) || c == '_'))
        {
            text.Append(Consume());
        }

        var word = text.ToString();
        var length = index - startIndex;
        return word switch
        {
            "func" => new TokenKeyword(startLine, startColumn, length, Keyword.Func),
            "let" => new TokenKeyword(startLine, startColumn, length, Keyword.Let),
            "if" => new TokenKeyword(startLine, startColumn, length, Keyword.If),
            "else" => new TokenKeyword(startLine, startColumn, length, Keyword.Else),
            "return" => new TokenKeyword(startLine, startColumn, length, Keyword.Return),
            "true" => new TokenBoolLiteral(startLine, startColumn, length, true),
            "false" => new TokenBoolLiteral(startLine, startColumn, length, false),
            _ => new TokenIdent(startLine, startColumn, length, word),
        };
    }

    /// <summary>
    /// Scans a one- or two-character punctuation token starting with <paramref name="c"/>.
    /// A two-character form (<c>!=</c>, <c>==</c>, <c>&lt;=</c>, ...) wins over its one-character prefix.
    /// </summary>
    /// <exception cref="FormatException"><paramref name="c"/> does not start any known symbol.</exception>
    private TokenSymbol ParseSymbol(char c)
    {
        var (symbol, length) = (c, Peek(1) is '=') switch
        {
            ('{', _) => (Symbol.OpenCurly, 1),
            ('}', _) => (Symbol.CloseCurly, 1),
            ('(', _) => (Symbol.OpenParen, 1),
            (')', _) => (Symbol.CloseParen, 1),
            (',', _) => (Symbol.Comma, 1),
            (':', _) => (Symbol.Colon, 1),
            ('^', _) => (Symbol.Caret, 1),
            ('!', true) => (Symbol.BangEqual, 2),
            ('!', false) => (Symbol.Bang, 1),
            ('=', true) => (Symbol.EqualEqual, 2),
            ('=', false) => (Symbol.Equal, 1),
            ('<', true) => (Symbol.LessThanEqual, 2),
            ('<', false) => (Symbol.LessThan, 1),
            ('>', true) => (Symbol.GreaterThanEqual, 2),
            ('>', false) => (Symbol.GreaterThan, 1),
            ('+', true) => (Symbol.PlusEqual, 2),
            ('+', false) => (Symbol.Plus, 1),
            ('-', true) => (Symbol.MinusEqual, 2),
            ('-', false) => (Symbol.Minus, 1),
            ('*', true) => (Symbol.StarEqual, 2),
            ('*', false) => (Symbol.Star, 1),
            ('/', true) => (Symbol.ForwardSlashEqual, 2),
            ('/', false) => (Symbol.ForwardSlash, 1),
            _ => throw new FormatException(
                $"Unexpected character '{c}' at line {startLine}, column {startColumn}"),
        };

        for (var i = 0; i < length; i++)
        {
            Consume();
        }

        return new TokenSymbol(startLine, startColumn, length, symbol);
    }

    /// <summary>
    /// Returns the current character and advances past it, updating the line and column counters.
    /// </summary>
    /// <exception cref="InvalidOperationException">The read position is already at the end of the input.</exception>
    private char Consume()
    {
        if (index >= contents.Length)
        {
            throw new InvalidOperationException("End of tokens");
        }

        var c = contents[index];
        if (c == '\n')
        {
            line += 1;
            column = 1;
        }
        else
        {
            column += 1;
        }

        index += 1;
        return c;
    }

    /// <summary>
    /// Returns the character <paramref name="offset"/> positions ahead of the read position
    /// without consuming it, or <c>null</c> when that position is past the end of the input.
    /// </summary>
    private char? Peek(int offset = 0)
    {
        if (index + offset >= contents.Length)
        {
            return null;
        }

        return contents[index + offset];
    }

    /// <summary>
    /// Reads the current character without consuming it.
    /// </summary>
    /// <returns><c>false</c> (with <paramref name="c"/> set to <c>'\0'</c>) at the end of the input.</returns>
    private bool TryPeek(out char c)
    {
        if (index >= contents.Length)
        {
            c = '\0';
            return false;
        }

        c = contents[index];
        return true;
    }
}

/// <summary>Base type of every token; carries the token's position in the source text.</summary>
public abstract class Token(int line, int column, int length)
{
    /// <summary>1-based line on which the token starts.</summary>
    public int Line = line;

    /// <summary>1-based column of the token's first character.</summary>
    public int Column = column;

    /// <summary>Number of source characters the token spans, delimiters and prefixes included.</summary>
    public int Length = length;
}

/// <summary>An identifier: a word that is neither a keyword nor a boolean literal.</summary>
public sealed class TokenIdent(int line, int column, int length, string ident) : Token(line, column, length)
{
    public readonly string Ident = ident;
}

/// <summary>An integer literal written in decimal, hexadecimal (<c>0x</c>) or binary (<c>0b</c>).</summary>
public sealed class TokenIntLiteral(int line, int column, int length, BigInteger value) : Token(line, column, length)
{
    public BigInteger Value = value;
}

/// <summary>A double-quoted string literal; <see cref="Value"/> is the text between the quotes.</summary>
public sealed class TokenStringLiteral(int line, int column, int length, string value) : Token(line, column, length)
{
    public readonly string Value = value;
}

/// <summary>The literal <c>true</c> or <c>false</c>.</summary>
public sealed class TokenBoolLiteral(int line, int column, int length, bool value) : Token(line, column, length)
{
    public readonly bool Value = value;
}

/// <summary>Punctuation and operator tokens recognised by <see cref="Tokenizer"/>.</summary>
public enum Symbol
{
    OpenCurly,
    CloseCurly,
    OpenParen,
    CloseParen,
    Comma,
    Colon,
    Caret,
    Bang,
    Equal,
    EqualEqual,
    BangEqual,
    LessThan,
    LessThanEqual,
    GreaterThan,
    GreaterThanEqual,
    Plus,
    PlusEqual,
    Minus,
    MinusEqual,
    Star,
    StarEqual,
    ForwardSlash,
    ForwardSlashEqual,
}

/// <summary>A punctuation or operator token.</summary>
public sealed class TokenSymbol(int line, int column, int length, Symbol symbol) : Token(line, column, length)
{
    public readonly Symbol Symbol = symbol;
}

/// <summary>Reserved words recognised by <see cref="Tokenizer"/>.</summary>
public enum Keyword
{
    Func,
    Let,
    If,
    Else,
    Return,
}

/// <summary>A reserved-word token.</summary>
public sealed class TokenKeyword(int line, int column, int length, Keyword keyword) : Token(line, column, length)
{
    public readonly Keyword Keyword = keyword;
}