Perf improvements in tokenizer
This commit is contained in:
@@ -68,171 +68,196 @@ public sealed class Tokenizer
|
||||
.Select(kvp => (kvp.Key, kvp.Value))
|
||||
.ToArray();
|
||||
|
||||
private readonly SourceFile _sourceFile;
|
||||
private readonly List<Diagnostic> _diagnostics = [];
|
||||
private int _index;
|
||||
private readonly string _fileName;
|
||||
private readonly string _content;
|
||||
private int _index = 0;
|
||||
private int _line = 1;
|
||||
private int _column = 1;
|
||||
|
||||
public Tokenizer(SourceFile sourceFile)
|
||||
public Tokenizer(string fileName, string content)
|
||||
{
|
||||
_sourceFile = sourceFile;
|
||||
_fileName = fileName;
|
||||
_content = content;
|
||||
}
|
||||
|
||||
public List<Diagnostic> GetDiagnostics() => _diagnostics;
|
||||
public List<Diagnostic> Diagnostics { get; } = [];
|
||||
public List<Token> Tokens { get; } = [];
|
||||
|
||||
public IEnumerable<Token> Tokenize()
|
||||
public void Tokenize()
|
||||
{
|
||||
Diagnostics.Clear();
|
||||
Tokens.Clear();
|
||||
_index = 0;
|
||||
_line = 1;
|
||||
_column = 1;
|
||||
|
||||
while (Peek() != null)
|
||||
while (Peek().HasValue)
|
||||
{
|
||||
var current = Peek()!.Value;
|
||||
if (char.IsWhiteSpace(current))
|
||||
try
|
||||
{
|
||||
Next();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current == '/' && Peek(1) == '/')
|
||||
{
|
||||
while (Peek().HasValue && Peek() != '\n')
|
||||
// Skip whitespace and increment line counter if newline
|
||||
var current = Peek()!.Value;
|
||||
if (char.IsWhiteSpace(current))
|
||||
{
|
||||
if (current is '\n')
|
||||
{
|
||||
_line += 1;
|
||||
_column = 1;
|
||||
}
|
||||
|
||||
Next();
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
var tokenStartIndex = _index;
|
||||
|
||||
if (char.IsLetter(current) || current == '_')
|
||||
{
|
||||
var buffer = string.Empty;
|
||||
|
||||
while (Peek() != null && (char.IsLetterOrDigit(Peek()!.Value) || Peek() == '_'))
|
||||
{
|
||||
buffer += Peek();
|
||||
Next();
|
||||
}
|
||||
|
||||
if (Keywords.TryGetValue(buffer, out var keywordSymbol))
|
||||
{
|
||||
yield return new SymbolToken(GetSourceFileSpan(tokenStartIndex), keywordSymbol);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (buffer is "true" or "false")
|
||||
// Skip single line comments but keep newline so next iteration increments the line counter
|
||||
if (current == '/' && Peek(1) == '/')
|
||||
{
|
||||
yield return new LiteralToken(GetSourceFileSpan(tokenStartIndex), LiteralKind.Bool, buffer);
|
||||
while (Peek() is not '\n')
|
||||
{
|
||||
Next();
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
yield return new IdentifierToken(GetSourceFileSpan(tokenStartIndex), buffer);
|
||||
continue;
|
||||
Tokens.Add(ParseToken(current, _line, _column));
|
||||
}
|
||||
|
||||
if (char.IsDigit(current))
|
||||
{
|
||||
var isFloat = false;
|
||||
var buffer = string.Empty;
|
||||
|
||||
while (Peek() != null)
|
||||
{
|
||||
var next = Peek()!.Value;
|
||||
if (next == '.')
|
||||
{
|
||||
if (isFloat)
|
||||
{
|
||||
throw new Exception("More than one period found in float literal");
|
||||
}
|
||||
|
||||
isFloat = true;
|
||||
buffer += next;
|
||||
Next();
|
||||
}
|
||||
else if (char.IsDigit(next))
|
||||
{
|
||||
buffer += next;
|
||||
Next();
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
yield return new LiteralToken(GetSourceFileSpan(tokenStartIndex), isFloat ? LiteralKind.Float : LiteralKind.Integer, buffer);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current == '"')
|
||||
catch (TokenizerException e)
|
||||
{
|
||||
Diagnostics.Add(e.Diagnostic);
|
||||
Next();
|
||||
var buffer = string.Empty;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (true)
|
||||
private Token ParseToken(char current, int lineStart, int columnStart)
|
||||
{
|
||||
if (char.IsLetter(current) || current == '_')
|
||||
{
|
||||
var buffer = string.Empty;
|
||||
|
||||
while (Peek() != null && (char.IsLetterOrDigit(Peek()!.Value) || Peek() == '_'))
|
||||
{
|
||||
buffer += Peek();
|
||||
Next();
|
||||
}
|
||||
|
||||
if (Keywords.TryGetValue(buffer, out var keywordSymbol))
|
||||
{
|
||||
return new SymbolToken(_fileName, CreateSpan(lineStart, columnStart), keywordSymbol);
|
||||
}
|
||||
|
||||
if (buffer is "true" or "false")
|
||||
{
|
||||
return new LiteralToken(_fileName, CreateSpan(lineStart, columnStart), LiteralKind.Bool, buffer);
|
||||
}
|
||||
|
||||
return new IdentifierToken(_fileName, CreateSpan(lineStart, columnStart), buffer);
|
||||
}
|
||||
|
||||
if (char.IsDigit(current))
|
||||
{
|
||||
var isFloat = false;
|
||||
var buffer = string.Empty;
|
||||
|
||||
while (Peek() != null)
|
||||
{
|
||||
var next = Peek()!.Value;
|
||||
if (next == '.')
|
||||
{
|
||||
if (Peek() == null)
|
||||
if (isFloat)
|
||||
{
|
||||
throw new Exception("Unclosed string literal");
|
||||
}
|
||||
|
||||
var next = Peek()!.Value;
|
||||
if (next == '"')
|
||||
{
|
||||
Next();
|
||||
break;
|
||||
throw new TokenizerException(Diagnostic
|
||||
.Error("More than one period found in float literal")
|
||||
.At(_fileName, _line, _column)
|
||||
.Build());
|
||||
}
|
||||
|
||||
isFloat = true;
|
||||
buffer += next;
|
||||
Next();
|
||||
}
|
||||
|
||||
yield return new LiteralToken(GetSourceFileSpan(tokenStartIndex), LiteralKind.String, buffer);
|
||||
continue;
|
||||
}
|
||||
|
||||
var foundMatch = false;
|
||||
foreach (var (pattern, symbol) in OrderedSymbols)
|
||||
{
|
||||
for (var i = 0; i < pattern.Length; i++)
|
||||
else if (char.IsDigit(next))
|
||||
{
|
||||
var c = Peek(i);
|
||||
if (!c.HasValue || c.Value != pattern[i]) break;
|
||||
|
||||
if (i == pattern.Length - 1)
|
||||
{
|
||||
for (var j = 0; j <= i; j++)
|
||||
{
|
||||
Next();
|
||||
}
|
||||
|
||||
yield return new SymbolToken(GetSourceFileSpan(tokenStartIndex), symbol);
|
||||
foundMatch = true;
|
||||
break;
|
||||
}
|
||||
buffer += next;
|
||||
Next();
|
||||
}
|
||||
|
||||
if (foundMatch)
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (foundMatch)
|
||||
return new LiteralToken(_fileName, CreateSpan(lineStart, columnStart), isFloat ? LiteralKind.Float : LiteralKind.Integer, buffer);
|
||||
}
|
||||
|
||||
if (current == '"')
|
||||
{
|
||||
Next();
|
||||
var buffer = string.Empty;
|
||||
|
||||
while (true)
|
||||
{
|
||||
continue;
|
||||
var next = Peek();
|
||||
if (!next.HasValue)
|
||||
{
|
||||
throw new TokenizerException(Diagnostic
|
||||
.Error("Unclosed string literal")
|
||||
.At(_fileName, _line, _column)
|
||||
.Build());
|
||||
}
|
||||
|
||||
if (next is '\n')
|
||||
{
|
||||
_line += 1;
|
||||
break;
|
||||
}
|
||||
|
||||
if (next is '"')
|
||||
{
|
||||
Next();
|
||||
break;
|
||||
}
|
||||
|
||||
buffer += next;
|
||||
Next();
|
||||
}
|
||||
|
||||
_diagnostics.Add(Diagnostic.Error($"Unknown token '{current}'").At(GetSourceFileSpan(tokenStartIndex)).Build());
|
||||
Next();
|
||||
return new LiteralToken(_fileName, CreateSpan(lineStart, columnStart), LiteralKind.String, buffer);
|
||||
}
|
||||
|
||||
foreach (var (pattern, symbol) in OrderedSymbols)
|
||||
{
|
||||
for (var i = 0; i < pattern.Length; i++)
|
||||
{
|
||||
var c = Peek(i);
|
||||
if (!c.HasValue || c.Value != pattern[i]) break;
|
||||
|
||||
if (i == pattern.Length - 1)
|
||||
{
|
||||
for (var j = 0; j <= i; j++)
|
||||
{
|
||||
Next();
|
||||
}
|
||||
|
||||
return new SymbolToken(_fileName, CreateSpan(lineStart, columnStart), symbol);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
throw new TokenizerException(Diagnostic.Error($"Unknown token '{current}'").Build());
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the source span for a token that started at the given position
/// and ends at the tokenizer's current <c>_line</c> / <c>_column</c>.
/// </summary>
/// <param name="lineStart">Line on which the token started.</param>
/// <param name="columnStart">Column at which the token started.</param>
/// <returns>
/// A <c>SourceSpan</c> for <c>_fileName</c> running from the start position
/// to the current position.
/// </returns>
private SourceSpan CreateSpan(int lineStart, int columnStart)
{
    var start = new SourceLocation(lineStart, columnStart);
    var end = new SourceLocation(_line, _column);

    return new SourceSpan(_fileName, start, end);
}
|
||||
|
||||
private char? Peek(int offset = 0)
|
||||
{
|
||||
if (_index + offset < _sourceFile.GetText().Length)
|
||||
if (_index + offset < _content.Length)
|
||||
{
|
||||
return _sourceFile.GetText()[_index + offset];
|
||||
return _content[_index + offset];
|
||||
}
|
||||
|
||||
return null;
|
||||
@@ -240,34 +265,17 @@ public sealed class Tokenizer
|
||||
|
||||
private void Next()
|
||||
{
|
||||
_index++;
|
||||
_index += 1;
|
||||
_column += 1;
|
||||
}
|
||||
}
|
||||
|
||||
private SourceFileSpan GetSourceFileSpan(int tokenStartIndex)
|
||||
public class TokenizerException : Exception
|
||||
{
|
||||
public Diagnostic Diagnostic { get; }
|
||||
|
||||
public TokenizerException(Diagnostic diagnostic) : base(diagnostic.Message)
|
||||
{
|
||||
var start = CalculateSourceLocation(tokenStartIndex);
|
||||
var end = CalculateSourceLocation(_index);
|
||||
return new SourceFileSpan(_sourceFile, new SourceSpan(start, end));
|
||||
}
|
||||
|
||||
private SourceLocation CalculateSourceLocation(int index)
|
||||
{
|
||||
var line = 1;
|
||||
var column = 1;
|
||||
|
||||
for (var i = 0; i < index && i < _sourceFile.GetText().Length; i++)
|
||||
{
|
||||
if (_sourceFile.GetText()[i] == '\n')
|
||||
{
|
||||
line++;
|
||||
column = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
column++;
|
||||
}
|
||||
}
|
||||
|
||||
return new SourceLocation(line, column);
|
||||
Diagnostic = diagnostic;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user