Perf improvements in tokenizer

nub31
2025-09-29 14:11:48 +02:00
parent b5799633bf
commit c74394a849
10 changed files with 219 additions and 274 deletions
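Context for the `Tokenizer` diff below, as a hedged sketch rather than a description of the exact code: the previous revision recomputed each token's line and column by walking the source text from the beginning for every token, while this revision keeps running `_line`/`_column` counters and reads them when a token is produced. The snippet below only illustrates that difference; the names and signatures are simplified and are not the project's real API.

```csharp
// Illustrative only; simplified signatures, not the project's API.

// Old style: O(index) work per token, quadratic over the whole file.
static (int Line, int Column) Locate(string text, int index)
{
    int line = 1, column = 1;
    for (var i = 0; i < index; i++)
    {
        if (text[i] == '\n') { line++; column = 1; }
        else { column++; }
    }
    return (line, column);
}

// New style: counters advance as characters are consumed, so building a
// source span is a constant-time read of the current line and column.
```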


@@ -68,171 +68,196 @@ public sealed class Tokenizer
        .Select(kvp => (kvp.Key, kvp.Value))
        .ToArray();

    private readonly string _fileName;
    private readonly string _content;
    private int _index = 0;
    private int _line = 1;
    private int _column = 1;

    public Tokenizer(string fileName, string content)
    {
        _fileName = fileName;
        _content = content;
    }

    public List<Diagnostic> Diagnostics { get; } = [];
    public List<Token> Tokens { get; } = [];
    public void Tokenize()
    {
        Diagnostics.Clear();
        Tokens.Clear();
        _index = 0;
        _line = 1;
        _column = 1;

        while (Peek().HasValue)
        {
            try
            {
                var current = Peek()!.Value;

                // Skip whitespace and increment the line counter on newlines
                if (char.IsWhiteSpace(current))
                {
                    if (current is '\n')
                    {
                        _line += 1;
                        _column = 1;
                    }
                    Next();
                    continue;
                }

                // Skip single-line comments but keep the newline so the next
                // iteration increments the line counter
                if (current == '/' && Peek(1) == '/')
                {
                    while (Peek().HasValue && Peek() != '\n')
                    {
                        Next();
                    }
                    continue;
                }

                Tokens.Add(ParseToken(current, _line, _column));
            }
            catch (TokenizerException e)
            {
                Diagnostics.Add(e.Diagnostic);
                Next();
            }
        }
    }
    private Token ParseToken(char current, int lineStart, int columnStart)
    {
        if (char.IsLetter(current) || current == '_')
        {
            var buffer = string.Empty;
            while (Peek() != null && (char.IsLetterOrDigit(Peek()!.Value) || Peek() == '_'))
            {
                buffer += Peek();
                Next();
            }
            if (Keywords.TryGetValue(buffer, out var keywordSymbol))
            {
                return new SymbolToken(_fileName, CreateSpan(lineStart, columnStart), keywordSymbol);
            }
            if (buffer is "true" or "false")
            {
                return new LiteralToken(_fileName, CreateSpan(lineStart, columnStart), LiteralKind.Bool, buffer);
            }
            return new IdentifierToken(_fileName, CreateSpan(lineStart, columnStart), buffer);
        }
        // Integer and float literals
        if (char.IsDigit(current))
        {
            var isFloat = false;
            var buffer = string.Empty;
            while (Peek() != null)
            {
                var next = Peek()!.Value;
                if (next == '.')
                {
                    if (isFloat)
                    {
                        throw new TokenizerException(Diagnostic
                            .Error("More than one period found in float literal")
                            .At(_fileName, _line, _column)
                            .Build());
                    }
                    isFloat = true;
                    buffer += next;
                    Next();
                }
                else if (char.IsDigit(next))
                {
                    buffer += next;
                    Next();
                }
                else
                {
                    break;
                }
            }
            return new LiteralToken(_fileName, CreateSpan(lineStart, columnStart), isFloat ? LiteralKind.Float : LiteralKind.Integer, buffer);
        }
        // String literals
        if (current == '"')
        {
            Next();
            var buffer = string.Empty;
            while (true)
            {
                var next = Peek();
                if (!next.HasValue)
                {
                    throw new TokenizerException(Diagnostic
                        .Error("Unclosed string literal")
                        .At(_fileName, _line, _column)
                        .Build());
                }
                if (next is '\n')
                {
                    // Terminate the literal at the end of the line; the newline is left
                    // unconsumed so the main loop's whitespace handling counts it.
                    break;
                }
                if (next is '"')
                {
                    Next();
                    break;
                }
                buffer += next;
                Next();
            }
            return new LiteralToken(_fileName, CreateSpan(lineStart, columnStart), LiteralKind.String, buffer);
        }
        foreach (var (pattern, symbol) in OrderedSymbols)
        {
            for (var i = 0; i < pattern.Length; i++)
            {
                var c = Peek(i);
                if (!c.HasValue || c.Value != pattern[i]) break;
                if (i == pattern.Length - 1)
                {
                    for (var j = 0; j <= i; j++)
                    {
                        Next();
                    }
                    return new SymbolToken(_fileName, CreateSpan(lineStart, columnStart), symbol);
                }
            }
        }

        throw new TokenizerException(Diagnostic.Error($"Unknown token '{current}'").Build());
    }
    private SourceSpan CreateSpan(int lineStart, int columnStart)
    {
        return new SourceSpan(_fileName, new SourceLocation(lineStart, columnStart), new SourceLocation(_line, _column));
    }

    private char? Peek(int offset = 0)
    {
        if (_index + offset < _content.Length)
        {
            return _content[_index + offset];
        }
        return null;
@@ -240,34 +265,17 @@ public sealed class Tokenizer
    private void Next()
    {
        _index += 1;
        _column += 1;
    }
}

public class TokenizerException : Exception
{
    public Diagnostic Diagnostic { get; }

    public TokenizerException(Diagnostic diagnostic) : base(diagnostic.Message)
    {
        Diagnostic = diagnostic;
    }
}
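A minimal usage sketch of the reworked surface, assuming only what the diff above shows: `Tokenize()` now returns `void` and fills the `Tokens` and `Diagnostics` lists instead of yielding tokens. The input path and the driver code are hypothetical.

```csharp
using System;
using System.IO;

// Hypothetical driver; "main.src" is a made-up input path.
var path = "main.src";
var tokenizer = new Tokenizer(path, File.ReadAllText(path));
tokenizer.Tokenize();

// Errors are collected rather than thrown out of Tokenize().
foreach (var diagnostic in tokenizer.Diagnostics)
{
    Console.Error.WriteLine(diagnostic);
}

Console.WriteLine($"{tokenizer.Tokens.Count} tokens, {tokenizer.Diagnostics.Count} diagnostics");
```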