using NubLang.Diagnostics;

namespace NubLang.Syntax;

/// <summary>
/// Splits source text into tokens. Call <see cref="Tokenize"/> and then read
/// <see cref="Tokens"/> and <see cref="Diagnostics"/>.
/// </summary>
public sealed class Tokenizer
{
    /// <summary>Reserved words and the symbol each one produces.</summary>
    private static readonly Dictionary<string, Symbol> Keywords = new()
    {
        ["func"] = Symbol.Func,
        ["if"] = Symbol.If,
        ["else"] = Symbol.Else,
        ["while"] = Symbol.While,
        ["break"] = Symbol.Break,
        ["continue"] = Symbol.Continue,
        ["return"] = Symbol.Return,
        ["struct"] = Symbol.Struct,
        ["let"] = Symbol.Let,
        ["extern"] = Symbol.Extern,
        ["module"] = Symbol.Module,
        ["export"] = Symbol.Export,
        ["import"] = Symbol.Import,
        ["defer"] = Symbol.Defer,
    };

    /// <summary>
    /// Punctuation and operator patterns, longest first, so that a two-character
    /// operator such as "==" is tried before its one-character prefix "=".
    /// </summary>
    private static readonly (string Pattern, Symbol Symbol)[] OrderedSymbols = new (string Pattern, Symbol Symbol)[]
        {
            ("==", Symbol.Equal),
            ("!=", Symbol.NotEqual),
            ("<=", Symbol.LessThanOrEqual),
            (">=", Symbol.GreaterThanOrEqual),
            ("<<", Symbol.LeftShift),
            (">>", Symbol.RightShift),
            ("&&", Symbol.And),
            ("||", Symbol.Or),
            ("::", Symbol.DoubleColon),
            (":", Symbol.Colon),
            ("(", Symbol.OpenParen),
            (")", Symbol.CloseParen),
            ("{", Symbol.OpenBrace),
            ("}", Symbol.CloseBrace),
            ("[", Symbol.OpenBracket),
            ("]", Symbol.CloseBracket),
            (",", Symbol.Comma),
            (".", Symbol.Period),
            ("=", Symbol.Assign),
            ("<", Symbol.LessThan),
            (">", Symbol.GreaterThan),
            ("+", Symbol.Plus),
            ("-", Symbol.Minus),
            ("*", Symbol.Star),
            ("/", Symbol.ForwardSlash),
            ("!", Symbol.Bang),
            ("^", Symbol.Caret),
            ("&", Symbol.Ampersand),
            (";", Symbol.Semi),
            ("%", Symbol.Percent),
            ("|", Symbol.Pipe),
            ("@", Symbol.At),
        }
        .OrderByDescending(entry => entry.Pattern.Length)
        .ToArray();

    private readonly string _fileName;
    private readonly string _content;
    private int _index = 0;
    private int _line = 1;
    private int _column = 1;

    /// <summary>Creates a tokenizer over <paramref name="content"/>.</summary>
    /// <param name="fileName">File name recorded in spans and diagnostics.</param>
    /// <param name="content">The source text to tokenize.</param>
    public Tokenizer(string fileName, string content)
    {
        _fileName = fileName;
        _content = content;
    }

    /// <summary>Diagnostics collected by the most recent <see cref="Tokenize"/> call.</summary>
    public List<Diagnostic> Diagnostics { get; } = [];

    /// <summary>Tokens produced by the most recent <see cref="Tokenize"/> call.</summary>
    public List<Token> Tokens { get; } = [];

    /// <summary>
    /// Tokenizes the whole content from the start. Previous results are cleared.
    /// A <see cref="TokenizerException"/> raised while reading a token is recorded
    /// in <see cref="Diagnostics"/> and scanning resumes one character later.
    /// </summary>
    public void Tokenize()
    {
        Diagnostics.Clear();
        Tokens.Clear();
        _index = 0;
        _line = 1;
        _column = 1;

        while (Peek() is { } current)
        {
            try
            {
                // Whitespace produces no token; Next() keeps line/column in sync.
                if (char.IsWhiteSpace(current))
                {
                    Next();
                    continue;
                }

                if (current == '/' && Peek(1) == '/')
                {
                    SkipLineComment();
                    continue;
                }

                Tokens.Add(ParseToken(current, _line, _column));
            }
            catch (TokenizerException e)
            {
                Diagnostics.Add(e.Diagnostic);
                // Skip one character so a malformed token cannot stall the loop.
                Next();
            }
        }
    }

    /// <summary>
    /// Advances to the end of a single line comment. Stops at the newline (which
    /// is left for the main loop) or at end of input, so a comment on the last
    /// line without a trailing newline terminates.
    /// </summary>
    private void SkipLineComment()
    {
        while (Peek() is { } c && c != '\n')
        {
            Next();
        }
    }

    /// <summary>
    /// Reads the token that starts at the current position.
    /// </summary>
    /// <param name="current">The character at the current position.</param>
    /// <param name="lineStart">Line on which the token starts.</param>
    /// <param name="columnStart">Column at which the token starts.</param>
    /// <exception cref="TokenizerException">The input at this position is not a valid token.</exception>
    private Token ParseToken(char current, int lineStart, int columnStart)
    {
        if (char.IsLetter(current) || current == '_')
        {
            return ParseWord(lineStart, columnStart);
        }

        if (char.IsDigit(current))
        {
            return ParseNumber(current, lineStart, columnStart);
        }

        if (current == '"')
        {
            return ParseStringLiteral(lineStart, columnStart);
        }

        foreach (var (pattern, symbol) in OrderedSymbols)
        {
            if (!Matches(pattern))
            {
                continue;
            }

            for (var i = 0; i < pattern.Length; i++)
            {
                Next();
            }

            return new SymbolToken(CreateSpan(lineStart, columnStart), symbol);
        }

        throw new TokenizerException(Diagnostic
            .Error($"Unknown token '{current}'")
            .At(_fileName, _line, _column)
            .Build());
    }

    /// <summary>Reads a keyword, a <c>true</c>/<c>false</c> literal, or an identifier.</summary>
    private Token ParseWord(int lineStart, int columnStart)
    {
        var start = _index;
        while (Peek() is { } c && (char.IsLetterOrDigit(c) || c == '_'))
        {
            Next();
        }

        var word = _content[start.._index];

        if (Keywords.TryGetValue(word, out var keywordSymbol))
        {
            return new SymbolToken(CreateSpan(lineStart, columnStart), keywordSymbol);
        }

        if (word is "true" or "false")
        {
            return new BoolLiteralToken(CreateSpan(lineStart, columnStart), word == "true");
        }

        return new IdentifierToken(CreateSpan(lineStart, columnStart), word);
    }

    /// <summary>
    /// Reads a hex (<c>0x</c>), binary (<c>0b</c>), decimal integer, or float literal.
    /// The literal text handed to the token includes any prefix.
    /// </summary>
    private Token ParseNumber(char current, int lineStart, int columnStart)
    {
        if (current == '0' && Peek(1) is 'x')
        {
            var hex = ReadPrefixedDigits(Uri.IsHexDigit, "Invalid hex literal, no digits found");
            return new IntLiteralToken(CreateSpan(lineStart, columnStart), hex, 16);
        }

        if (current == '0' && Peek(1) is 'b')
        {
            var binary = ReadPrefixedDigits(c => c is '0' or '1', "Invalid binary literal, no digits found");
            return new IntLiteralToken(CreateSpan(lineStart, columnStart), binary, 2);
        }

        var start = _index;
        var isFloat = false;

        while (Peek() is { } next)
        {
            if (next == '.')
            {
                if (isFloat)
                {
                    throw new TokenizerException(Diagnostic
                        .Error("More than one period found in float literal")
                        .At(_fileName, _line, _column)
                        .Build());
                }

                isFloat = true;
            }
            else if (!char.IsDigit(next))
            {
                break;
            }

            Next();
        }

        var text = _content[start.._index];

        if (isFloat)
        {
            return new FloatLiteralToken(CreateSpan(lineStart, columnStart), text);
        }

        return new IntLiteralToken(CreateSpan(lineStart, columnStart), text, 10);
    }

    /// <summary>
    /// Consumes a two-character prefix followed by every character accepted by
    /// <paramref name="isDigit"/>, and returns the consumed text including the prefix.
    /// </summary>
    /// <exception cref="TokenizerException">No digit follows the prefix.</exception>
    private string ReadPrefixedDigits(Func<char, bool> isDigit, string emptyMessage)
    {
        const int prefixLength = 2;
        var start = _index;

        for (var i = 0; i < prefixLength; i++)
        {
            Next();
        }

        while (Peek() is { } c && isDigit(c))
        {
            Next();
        }

        if (_index - start <= prefixLength)
        {
            throw new TokenizerException(Diagnostic
                .Error(emptyMessage)
                .At(_fileName, _line, _column)
                .Build());
        }

        return _content[start.._index];
    }

    /// <summary>
    /// Reads a double-quoted string literal. The literal ends at the closing quote,
    /// or at a newline, which is left unconsumed so the main loop counts the line
    /// exactly once.
    /// </summary>
    /// <exception cref="TokenizerException">End of input is reached before the literal ends.</exception>
    private Token ParseStringLiteral(int lineStart, int columnStart)
    {
        Next(); // opening quote
        var start = _index;

        while (Peek() is { } c && c != '"' && c != '\n')
        {
            Next();
        }

        if (Peek() is null)
        {
            throw new TokenizerException(Diagnostic
                .Error("Unclosed string literal")
                .At(_fileName, _line, _column)
                .Build());
        }

        var text = _content[start.._index];

        if (Peek() is '"')
        {
            Next(); // closing quote
        }

        return new StringLiteralToken(CreateSpan(lineStart, columnStart), text);
    }

    /// <summary>Returns true when the content at the current position starts with <paramref name="pattern"/>.</summary>
    private bool Matches(string pattern)
    {
        for (var i = 0; i < pattern.Length; i++)
        {
            if (Peek(i) != pattern[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>Builds a span from the given start to the current position.</summary>
    private SourceSpan CreateSpan(int lineStart, int columnStart)
    {
        return new SourceSpan(_fileName, new SourceLocation(lineStart, columnStart), new SourceLocation(_line, _column));
    }

    /// <summary>Returns the character <paramref name="offset"/> positions ahead, or null past the end of the content.</summary>
    private char? Peek(int offset = 0)
    {
        if (_index + offset < _content.Length)
        {
            return _content[_index + offset];
        }

        return null;
    }

    /// <summary>
    /// Consumes one character. Line and column tracking lives here so every caller,
    /// including error recovery, stays consistent: consuming a newline moves to
    /// column 1 of the next line, anything else moves one column to the right.
    /// </summary>
    private void Next()
    {
        if (Peek() is '\n')
        {
            _line += 1;
            _column = 1;
        }
        else
        {
            _column += 1;
        }

        _index += 1;
    }
}

/// <summary>Raised when the tokenizer meets invalid input; carries the diagnostic to report.</summary>
public class TokenizerException : Exception
{
    /// <summary>The diagnostic describing the failure.</summary>
    public Diagnostic Diagnostic { get; }

    public TokenizerException(Diagnostic diagnostic) : base(diagnostic.Message)
    {
        Diagnostic = diagnostic;
    }
}