using NubLang.Diagnostics;

namespace NubLang.Syntax;

/// <summary>
/// Splits source text into a flat list of tokens (literals, symbols/keywords and identifiers),
/// tracking line and column positions as it goes. Lexing errors are collected in
/// <see cref="Diagnostics"/> instead of aborting the run.
/// </summary>
public sealed class Tokenizer
{
    private string _fileName = null!;
    private string _content = null!;
    private int _index;
    private int _line = 1;
    private int _column = 1;

    /// <summary>
    /// Diagnostics collected by the most recent call to <see cref="Tokenize"/>.
    /// The list is replaced at the start of every call.
    /// </summary>
    public List<Diagnostic> Diagnostics { get; set; } = new(16);

    /// <summary>
    /// Tokenizes <paramref name="content"/>. Whitespace and <c>//</c> line comments are skipped.
    /// </summary>
    /// <param name="fileName">File name recorded in spans and diagnostics.</param>
    /// <param name="content">The source text to tokenize.</param>
    /// <returns>The tokens that could be read; errors are reported through <see cref="Diagnostics"/>.</returns>
    public List<Token> Tokenize(string fileName, string content)
    {
        _fileName = fileName;
        _content = content;

        Diagnostics = [];
        _index = 0;
        _line = 1;
        _column = 1;

        var tokens = new List<Token>();

        while (_index < _content.Length)
        {
            try
            {
                var current = _content[_index];

                if (char.IsWhiteSpace(current))
                {
                    if (current == '\n')
                    {
                        _line += 1;
                        // Next() below bumps the column to 1 for the first character of the new line.
                        _column = 0;
                    }

                    Next();
                    continue;
                }

                if (current == '/' && _index + 1 < _content.Length && _content[_index + 1] == '/')
                {
                    Next(2);

                    // Stop at the newline (without consuming it) so the whitespace branch counts the line.
                    while (_index < _content.Length && _content[_index] != '\n')
                    {
                        Next();
                    }

                    continue;
                }

                tokens.Add(ParseToken(current, _line, _column));
            }
            catch (CompileException e)
            {
                Diagnostics.Add(e.Diagnostic);

                // Skip the offending character so the loop always makes progress. A newline is left
                // for the whitespace branch above; skipping it here would lose a line increment and
                // shift the reported line of everything that follows. Progress is still guaranteed in
                // that case because a token never starts on whitespace, so the index has already moved.
                if (_index < _content.Length && _content[_index] != '\n')
                {
                    Next();
                }
            }
        }

        return tokens;
    }

    /// <summary>
    /// Reads one token starting at the current position. Numbers and strings are tried first,
    /// then symbols/keywords, then identifiers.
    /// </summary>
    /// <exception cref="CompileException">Thrown when no token can start with <paramref name="current"/>.</exception>
    private Token ParseToken(char current, int lineStart, int columnStart)
    {
        if (char.IsDigit(current))
        {
            return ParseNumber(lineStart, columnStart);
        }

        if (current == '"')
        {
            return ParseString(lineStart, columnStart);
        }

        // note(nub31): Look for keywords (longest first in case a keyword fits partially in a larger keyword)
        for (var i = 8; i >= 1; i--)
        {
            if (TryMatchSymbol(i, lineStart, columnStart, out var token))
            {
                return token;
            }
        }

        if (char.IsLetter(current) || current == '_')
        {
            return ParseIdentifier(lineStart, columnStart);
        }

        throw new CompileException(Diagnostic
            .Error($"Unknown token '{current}'")
            .At(_fileName, lineStart, columnStart)
            .Build());
    }

    /// <summary>
    /// Reads a hexadecimal (<c>0x…</c>), binary (<c>0b…</c>), decimal integer or float literal.
    /// The literal text, including any prefix, is passed to the token unchanged.
    /// </summary>
    /// <exception cref="CompileException">
    /// Thrown when a hex/binary prefix has no digits or a float literal contains a second period.
    /// </exception>
    private Token ParseNumber(int lineStart, int columnStart)
    {
        var start = _index;
        var current = _content[_index];

        // note(nub31): 0xFFFFFF
        if (current == '0' && _index + 1 < _content.Length && _content[_index + 1] == 'x')
        {
            Next(2);
            var digitStart = _index;

            while (_index < _content.Length && Uri.IsHexDigit(_content[_index]))
            {
                Next();
            }

            if (_index == digitStart)
            {
                throw new CompileException(Diagnostic
                    .Error("Invalid hex literal, no digits found")
                    .At(_fileName, _line, _column)
                    .Build());
            }

            return new IntLiteralToken(
                CreateSpan(lineStart, columnStart),
                _content.Substring(start, _index - start),
                16);
        }

        // note(nub31): 0b11001100
        if (current == '0' && _index + 1 < _content.Length && _content[_index + 1] == 'b')
        {
            Next(2);
            var digitStart = _index;

            while (_index < _content.Length && (_content[_index] == '0' || _content[_index] == '1'))
            {
                Next();
            }

            if (_index == digitStart)
            {
                throw new CompileException(Diagnostic
                    .Error("Invalid binary literal, no digits found")
                    .At(_fileName, _line, _column)
                    .Build());
            }

            return new IntLiteralToken(
                CreateSpan(lineStart, columnStart),
                _content.Substring(start, _index - start),
                2);
        }

        // note(nub31): 23/23.5
        var isFloat = false;
        while (_index < _content.Length)
        {
            var next = _content[_index];

            if (next == '.')
            {
                if (isFloat)
                {
                    throw new CompileException(Diagnostic
                        .Error("More than one period found in float literal")
                        .At(_fileName, _line, _column)
                        .Build());
                }

                isFloat = true;
                Next();
            }
            else if (char.IsDigit(next))
            {
                Next();
            }
            else
            {
                break;
            }
        }

        var buffer = _content.Substring(start, _index - start);

        return isFloat
            ? new FloatLiteralToken(CreateSpan(lineStart, columnStart), buffer)
            : new IntLiteralToken(CreateSpan(lineStart, columnStart), buffer, 10);
    }

    /// <summary>
    /// Reads a double-quoted string literal. The token value is the raw text between the quotes;
    /// no escape sequences are interpreted here.
    /// </summary>
    /// <exception cref="CompileException">
    /// Thrown when the end of input or a newline is reached before the closing quote.
    /// </exception>
    private StringLiteralToken ParseString(int lineStart, int columnStart)
    {
        // Skip the opening quote.
        Next();
        var start = _index;

        while (true)
        {
            if (_index >= _content.Length)
            {
                throw new CompileException(Diagnostic
                    .Error("Unclosed string literal")
                    .At(_fileName, _line, _column)
                    .Build());
            }

            var next = _content[_index];

            if (next == '\n')
            {
                throw new CompileException(Diagnostic
                    .Error("Unclosed string literal (newline found)")
                    .At(_fileName, _line, _column)
                    .Build());
            }

            if (next == '"')
            {
                var buffer = _content.Substring(start, _index - start);
                // Consume the closing quote before building the span so the span covers it.
                Next();
                return new StringLiteralToken(CreateSpan(lineStart, columnStart), buffer);
            }

            Next();
        }
    }

    /// <summary>
    /// Tries to match a boolean literal, keyword or operator of exactly <paramref name="length"/>
    /// characters at the current position. On success the characters are consumed.
    /// </summary>
    /// <returns><c>true</c> and a token when something matched; otherwise <c>false</c> with nothing consumed.</returns>
    private bool TryMatchSymbol(int length, int lineStart, int columnStart, out Token token)
    {
        token = null!;

        if (_index + length > _content.Length)
        {
            return false;
        }

        var span = _content.AsSpan(_index, length);

        if (span is "true")
        {
            // "trueValue" is an identifier, not the literal `true` followed by "Value".
            if (IsIdentifierPartAt(_index + length))
            {
                return false;
            }

            Next(4);
            token = new BoolLiteralToken(CreateSpan(lineStart, columnStart), true);
            return true;
        }

        if (span is "false")
        {
            // Same boundary rule as for `true`.
            if (IsIdentifierPartAt(_index + length))
            {
                return false;
            }

            Next(5);
            token = new BoolLiteralToken(CreateSpan(lineStart, columnStart), false);
            return true;
        }

        var symbol = length switch
        {
            8 => span switch
            {
                "continue" => Symbol.Continue,
                _ => Symbol.None
            },
            6 => span switch
            {
                "return" => Symbol.Return,
                "struct" => Symbol.Struct,
                "extern" => Symbol.Extern,
                "packed" => Symbol.Packed,
                "module" => Symbol.Module,
                "export" => Symbol.Export,
                _ => Symbol.None
            },
            5 => span switch
            {
                "break" => Symbol.Break,
                "while" => Symbol.While,
                "defer" => Symbol.Defer,
                _ => Symbol.None
            },
            4 => span switch
            {
                "func" => Symbol.Func,
                "else" => Symbol.Else,
                "enum" => Symbol.Enum,
                _ => Symbol.None
            },
            3 => span switch
            {
                "for" => Symbol.For,
                "let" => Symbol.Let,
                _ => Symbol.None
            },
            2 => span switch
            {
                "if" => Symbol.If,
                "in" => Symbol.In,
                "==" => Symbol.Equal,
                "!=" => Symbol.NotEqual,
                "<=" => Symbol.LessThanOrEqual,
                ">=" => Symbol.GreaterThanOrEqual,
                "<<" => Symbol.LeftShift,
                ">>" => Symbol.RightShift,
                "&&" => Symbol.And,
                "||" => Symbol.Or,
                "::" => Symbol.DoubleColon,
                _ => Symbol.None
            },
            1 => span[0] switch
            {
                ':' => Symbol.Colon,
                '(' => Symbol.OpenParen,
                ')' => Symbol.CloseParen,
                '{' => Symbol.OpenBrace,
                '}' => Symbol.CloseBrace,
                '[' => Symbol.OpenBracket,
                ']' => Symbol.CloseBracket,
                ',' => Symbol.Comma,
                '.' => Symbol.Period,
                '=' => Symbol.Assign,
                '<' => Symbol.LessThan,
                '>' => Symbol.GreaterThan,
                '+' => Symbol.Plus,
                '-' => Symbol.Minus,
                '*' => Symbol.Star,
                '/' => Symbol.ForwardSlash,
                '!' => Symbol.Bang,
                '^' => Symbol.Caret,
                '&' => Symbol.Ampersand,
                ';' => Symbol.Semi,
                '%' => Symbol.Percent,
                '|' => Symbol.Pipe,
                '@' => Symbol.At,
                '?' => Symbol.QuestionMark,
                '~' => Symbol.Tilde,
                _ => Symbol.None
            },
            _ => Symbol.None
        };

        if (symbol == Symbol.None)
        {
            return false;
        }

        // A keyword only counts when it is not the prefix of a longer identifier (e.g. "iffy").
        // Operators start with a non-letter and need no boundary check.
        if (char.IsLetter(span[0]) && IsIdentifierPartAt(_index + length))
        {
            return false;
        }

        Next(length);
        token = new SymbolToken(CreateSpan(lineStart, columnStart), symbol);
        return true;
    }

    /// <summary>
    /// Returns whether <paramref name="index"/> is inside the content and holds a character that
    /// can continue an identifier (a letter, a digit or an underscore).
    /// </summary>
    private bool IsIdentifierPartAt(int index)
    {
        if (index >= _content.Length)
        {
            return false;
        }

        var ch = _content[index];
        return char.IsLetterOrDigit(ch) || ch == '_';
    }

    /// <summary>
    /// Reads an identifier: a run of letters, digits and underscores starting at the current position.
    /// </summary>
    private IdentifierToken ParseIdentifier(int lineStart, int columnStart)
    {
        var start = _index;

        while (_index < _content.Length)
        {
            var ch = _content[_index];
            if (char.IsLetterOrDigit(ch) || ch == '_')
            {
                Next();
            }
            else
            {
                break;
            }
        }

        return new IdentifierToken(CreateSpan(lineStart, columnStart), _content.Substring(start, _index - start));
    }

    /// <summary>
    /// Builds a span from the given start location to the tokenizer's current line and column.
    /// </summary>
    private SourceSpan CreateSpan(int lineStart, int columnStart)
    {
        return new SourceSpan(_fileName, new SourceLocation(lineStart, columnStart), new SourceLocation(_line, _column));
    }

    /// <summary>
    /// Advances the read position and the column by <paramref name="count"/> characters.
    /// Line tracking is handled by the caller.
    /// </summary>
    private void Next(int count = 1)
    {
        _index += count;
        _column += count;
    }
}