using System.Text;
using NubLang.Code;
using NubLang.Diagnostics;

namespace NubLang.Tokenization;

/// <summary>
/// Splits the text of a <see cref="SourceFile"/> into keyword/symbol, literal and identifier tokens.
/// Whitespace and line comments (two forward slashes up to the end of the line) are skipped.
/// Problems found in the input are collected as diagnostics instead of being thrown.
/// </summary>
// NOTE(review): the generic type arguments used below (Token as the element type of Tokenize(),
// Diagnostic as the element type of the diagnostics list) are inferred from how the values are
// used in this file — confirm against the declarations in the rest of the project.
public sealed class Tokenizer
{
    /// <summary>Reserved words and the symbol each one is emitted as.</summary>
    private static readonly Dictionary<string, Symbol> Keywords = new()
    {
        ["func"] = Symbol.Func,
        ["if"] = Symbol.If,
        ["else"] = Symbol.Else,
        ["while"] = Symbol.While,
        ["break"] = Symbol.Break,
        ["continue"] = Symbol.Continue,
        ["return"] = Symbol.Return,
        ["struct"] = Symbol.Struct,
        ["let"] = Symbol.Let,
        ["calls"] = Symbol.Calls,
        ["interface"] = Symbol.Interface,
        ["for"] = Symbol.For,
        ["extern"] = Symbol.Extern,
        ["module"] = Symbol.Module,
        ["export"] = Symbol.Export,
        ["import"] = Symbol.Import,
    };

    /// <summary>Punctuation and operator patterns in declaration order.</summary>
    private static readonly (string Pattern, Symbol Symbol)[] SymbolPatterns =
    [
        ("==", Symbol.Equal),
        ("!=", Symbol.NotEqual),
        ("<=", Symbol.LessThanOrEqual),
        (">=", Symbol.GreaterThanOrEqual),
        ("<<", Symbol.LeftShift),
        (">>", Symbol.RightShift),
        ("&&", Symbol.And),
        ("||", Symbol.Or),
        (":", Symbol.Colon),
        ("(", Symbol.OpenParen),
        (")", Symbol.CloseParen),
        ("{", Symbol.OpenBrace),
        ("}", Symbol.CloseBrace),
        ("[", Symbol.OpenBracket),
        ("]", Symbol.CloseBracket),
        (",", Symbol.Comma),
        (".", Symbol.Period),
        ("=", Symbol.Assign),
        ("<", Symbol.LessThan),
        (">", Symbol.GreaterThan),
        ("+", Symbol.Plus),
        ("-", Symbol.Minus),
        ("*", Symbol.Star),
        ("/", Symbol.ForwardSlash),
        ("!", Symbol.Bang),
        ("^", Symbol.Caret),
        ("&", Symbol.Ampersand),
        (";", Symbol.Semi),
        ("%", Symbol.Percent),
        ("|", Symbol.Pipe),
    ];

    /// <summary>
    /// The symbol patterns sorted longest first, so that a two-character operator is always
    /// tried before the one-character operator that is its prefix.
    /// </summary>
    private static readonly (string Pattern, Symbol Symbol)[] OrderedSymbols = SymbolPatterns
        .OrderByDescending(entry => entry.Pattern.Length)
        .ToArray();

    private readonly SourceFile _sourceFile;
    private readonly List<Diagnostic> _diagnostics = [];

    /// <summary>Index of the next unread character in the source text.</summary>
    private int _index;

    // Cursor remembered by CalculateSourceLocation so that consecutive lookups continue from
    // the previous position instead of rescanning the text from the beginning.
    private int _locationIndex;
    private int _locationLine = 1;
    private int _locationColumn = 1;

    /// <summary>Creates a tokenizer that reads from <paramref name="sourceFile"/>.</summary>
    /// <param name="sourceFile">The file whose text is tokenized.</param>
    public Tokenizer(SourceFile sourceFile)
    {
        _sourceFile = sourceFile;
    }

    /// <summary>
    /// Returns the diagnostics collected by the most recent enumeration of <see cref="Tokenize"/>.
    /// </summary>
    public IReadOnlyList<Diagnostic> GetDiagnostics() => _diagnostics;

    /// <summary>
    /// Lazily tokenizes the source text from the beginning.
    /// </summary>
    /// <remarks>
    /// This is an iterator: nothing is read, and no diagnostics are produced, until the result
    /// is enumerated. Each enumeration restarts at the first character and clears the
    /// diagnostics of the previous one. Unknown characters, unterminated string literals and
    /// numeric literals with more than one period are reported as diagnostics and produce no token.
    /// </remarks>
    /// <returns>The tokens in source order.</returns>
    public IEnumerable<Token> Tokenize()
    {
        _index = 0;
        // Without this, enumerating the result twice would report every problem twice.
        _diagnostics.Clear();

        while (TryPeek(0, out var current))
        {
            if (char.IsWhiteSpace(current))
            {
                Next();
                continue;
            }

            if (current == '/' && TryPeek(1, out var following) && following == '/')
            {
                SkipLineComment();
                continue;
            }

            var tokenStartIndex = _index;

            if (char.IsLetter(current) || current == '_')
            {
                var word = ReadWord();
                var wordSpan = GetSourceFileSpan(tokenStartIndex);

                if (Keywords.TryGetValue(word, out var keywordSymbol))
                {
                    yield return new SymbolToken(wordSpan, keywordSymbol);
                }
                else if (word is "true" or "false")
                {
                    yield return new LiteralToken(wordSpan, LiteralKind.Bool, word);
                }
                else
                {
                    yield return new IdentifierToken(wordSpan, word);
                }

                continue;
            }

            if (char.IsDigit(current))
            {
                if (TryReadNumber(tokenStartIndex, out var number, out var isFloat))
                {
                    var kind = isFloat ? LiteralKind.Float : LiteralKind.Integer;
                    yield return new LiteralToken(GetSourceFileSpan(tokenStartIndex), kind, number);
                }

                continue;
            }

            if (current == '"')
            {
                if (TryReadString(tokenStartIndex, out var content))
                {
                    yield return new LiteralToken(GetSourceFileSpan(tokenStartIndex), LiteralKind.String, content);
                }

                continue;
            }

            if (TryReadSymbol(out var symbol))
            {
                yield return new SymbolToken(GetSourceFileSpan(tokenStartIndex), symbol);
                continue;
            }

            // Consume the character before building the span so the diagnostic covers it
            // rather than pointing at an empty range in front of it.
            Next();
            _diagnostics.Add(Diagnostic.Error($"Unknown token '{current}'").At(GetSourceFileSpan(tokenStartIndex)).Build());
        }
    }

    /// <summary>Advances past every character up to, but not including, the next line feed.</summary>
    private void SkipLineComment()
    {
        while (TryPeek(0, out var ch) && ch != '\n')
        {
            Next();
        }
    }

    /// <summary>Reads a run of letters, digits and underscores starting at the current position.</summary>
    private string ReadWord()
    {
        var word = new StringBuilder();

        while (TryPeek(0, out var next) && (char.IsLetterOrDigit(next) || next == '_'))
        {
            word.Append(next);
            Next();
        }

        return word.ToString();
    }

    /// <summary>
    /// Reads a numeric literal: a run of digits containing at most one period.
    /// </summary>
    /// <param name="tokenStartIndex">Index of the first character of the literal, used for the diagnostic span.</param>
    /// <param name="text">The characters that were consumed.</param>
    /// <param name="isFloat">Whether the literal contains a period.</param>
    /// <returns>
    /// <c>false</c> when the run contains more than one period; the whole run is then consumed
    /// and a single diagnostic covering it is recorded.
    /// </returns>
    private bool TryReadNumber(int tokenStartIndex, out string text, out bool isFloat)
    {
        var digits = new StringBuilder();
        var periodCount = 0;

        while (TryPeek(0, out var next) && (next == '.' || char.IsDigit(next)))
        {
            if (next == '.')
            {
                periodCount++;
            }

            digits.Append(next);
            Next();
        }

        text = digits.ToString();
        isFloat = periodCount > 0;

        if (periodCount <= 1)
        {
            return true;
        }

        _diagnostics.Add(Diagnostic.Error("More than one period found in float literal").At(GetSourceFileSpan(tokenStartIndex)).Build());
        return false;
    }

    /// <summary>
    /// Reads a string literal whose opening quote is at the current position. The content is
    /// taken verbatim; no escape sequences are interpreted.
    /// </summary>
    /// <param name="tokenStartIndex">Index of the opening quote, used for the diagnostic span.</param>
    /// <param name="value">The characters between the quotes (or up to the end of the text when unterminated).</param>
    /// <returns>
    /// <c>false</c> when the end of the text is reached before a closing quote; a diagnostic
    /// is recorded in that case.
    /// </returns>
    private bool TryReadString(int tokenStartIndex, out string value)
    {
        Next(); // opening quote

        var content = new StringBuilder();

        while (TryPeek(0, out var next))
        {
            Next();

            if (next == '"')
            {
                value = content.ToString();
                return true;
            }

            content.Append(next);
        }

        value = content.ToString();
        _diagnostics.Add(Diagnostic.Error("Unclosed string literal").At(GetSourceFileSpan(tokenStartIndex)).Build());
        return false;
    }

    /// <summary>
    /// Matches the longest symbol pattern at the current position and consumes it.
    /// </summary>
    /// <param name="symbol">The matched symbol; the default value when nothing matches.</param>
    /// <returns><c>true</c> when a pattern matched and was consumed.</returns>
    private bool TryReadSymbol(out Symbol symbol)
    {
        foreach (var (pattern, candidate) in OrderedSymbols)
        {
            if (!IsAtPattern(pattern))
            {
                continue;
            }

            _index += pattern.Length;
            symbol = candidate;
            return true;
        }

        symbol = default;
        return false;
    }

    /// <summary>Checks whether the unread text starts with <paramref name="pattern"/>.</summary>
    private bool IsAtPattern(string pattern)
    {
        for (var i = 0; i < pattern.Length; i++)
        {
            if (!TryPeek(i, out var c) || c != pattern[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Looks at the character <paramref name="offset"/> positions ahead of the current index
    /// without consuming anything.
    /// </summary>
    /// <returns><c>false</c> when that position is at or beyond the end of the text.</returns>
    private bool TryPeek(int offset, out char value)
    {
        var text = _sourceFile.GetText();
        var position = _index + offset;

        if (position < text.Length)
        {
            value = text[position];
            return true;
        }

        value = default;
        return false;
    }

    /// <summary>Consumes one character.</summary>
    private void Next()
    {
        _index++;
    }

    /// <summary>
    /// Builds the span that starts at <paramref name="tokenStartIndex"/> and ends at the
    /// current index.
    /// </summary>
    private SourceFileSpan GetSourceFileSpan(int tokenStartIndex)
    {
        var start = CalculateSourceLocation(tokenStartIndex);
        var end = CalculateSourceLocation(_index);
        return new SourceFileSpan(_sourceFile, new SourceSpan(start, end));
    }

    /// <summary>
    /// Converts a character index into a 1-based line and column. A line feed starts a new
    /// line at column 1; every other character advances the column by one. Indices past the
    /// end of the text map to the location just after the last character.
    /// </summary>
    private SourceLocation CalculateSourceLocation(int index)
    {
        var text = _sourceFile.GetText();
        var target = Math.Min(index, text.Length);

        // Lookups normally move forward through the file, so the scan resumes from the
        // remembered cursor. A request for an earlier position restarts from the top.
        if (target < _locationIndex)
        {
            _locationIndex = 0;
            _locationLine = 1;
            _locationColumn = 1;
        }

        for (; _locationIndex < target; _locationIndex++)
        {
            if (text[_locationIndex] == '\n')
            {
                _locationLine++;
                _locationColumn = 1;
            }
            else
            {
                _locationColumn++;
            }
        }

        return new SourceLocation(_locationLine, _locationColumn);
    }
}