Perf improvements in tokenizer

nub31
2025-09-29 14:11:48 +02:00
parent b5799633bf
commit c74394a849
10 changed files with 219 additions and 274 deletions
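The main performance change: the old Tokenizer recomputed each token's line and column by rescanning the source from the start of the file (CalculateSourceLocation, removed below), while the new one keeps running _line/_column counters and takes the file contents up front. A rough sketch of the difference, using hypothetical helper names, for an n-character file with t tokens:

// Removed approach: an O(n) rescan per token, roughly O(n * t) overall.
static (int Line, int Column) LocateByRescan(string text, int index)
{
    var line = 1;
    var column = 1;
    for (var i = 0; i < index && i < text.Length; i++)
    {
        if (text[i] == '\n') { line++; column = 1; }
        else { column++; }
    }
    return (line, column);
}

// New approach (simplified): the counters advance with the cursor, so the
// file is scanned exactly once; the real tokenizer bumps _line on '\n' and
// _column in Next().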

View File

@@ -1,9 +1,7 @@
using NubLang.Code;
namespace NubLang.CLI;
public class Options
{
public string? OutputPath { get; set; }
public List<SourceFile> Files { get; } = [];
public List<string> Files { get; } = [];
}

View File

@@ -1,6 +1,5 @@
using System.Diagnostics;
using NubLang.CLI;
using NubLang.Code;
using NubLang.Diagnostics;
using NubLang.Generation.QBE;
using NubLang.Modules;
@@ -32,7 +31,7 @@ for (var i = 0; i < args.Length; i++)
}
default:
{
options.Files.Add(new SourceFile(arg));
options.Files.Add(arg);
break;
}
}
@@ -43,7 +42,7 @@ sw.Restart();
foreach (var file in options.Files)
{
if (!File.Exists(file.Path))
if (!File.Exists(file))
{
Console.Error.WriteLine($"File '{file}' does not exist");
return 1;
@@ -58,18 +57,18 @@ var diagnostics = new List<Diagnostic>();
var syntaxTrees = new List<SyntaxTree>();
foreach (var file in options.Files)
{
var tokenizer = new Tokenizer(file);
var tokens = tokenizer.Tokenize().ToList();
diagnostics.AddRange(tokenizer.GetDiagnostics());
var tokenizer = new Tokenizer(file, File.ReadAllText(file));
tokenizer.Tokenize();
diagnostics.AddRange(tokenizer.Diagnostics);
Console.WriteLine($"Tokenize: {Path.GetFileName(file.Path)}: {sw.ElapsedMilliseconds}ms");
Console.WriteLine($" Tokenize: {Path.GetFileName(file)}: {sw.ElapsedMilliseconds}ms");
sw.Restart();
var parser = new Parser();
var syntaxTree = parser.Parse(tokens);
diagnostics.AddRange(parser.GetDiagnostics());
var syntaxTree = parser.Parse(tokenizer.Tokens);
diagnostics.AddRange(parser.Diagnostics);
Console.WriteLine($"Parse: {Path.GetFileName(file.Path)}: {sw.ElapsedMilliseconds}ms");
Console.WriteLine($" Parse: {Path.GetFileName(file)}: {sw.ElapsedMilliseconds}ms");
sw.Restart();
syntaxTrees.Add(syntaxTree);
@@ -91,7 +90,7 @@ foreach (var syntaxTree in syntaxTrees)
var typeChecker = new TypeChecker(syntaxTree, moduleRepository);
typeChecker.Check();
Console.WriteLine($"Type check {syntaxTree.Metadata.ModuleName}: {sw.ElapsedMilliseconds}ms");
Console.WriteLine($" Type check {syntaxTree.Metadata.ModuleName}: {sw.ElapsedMilliseconds}ms");
sw.Restart();
definitions.AddRange(typeChecker.Definitions);

View File

@@ -1,41 +0,0 @@
namespace NubLang.Code;
public class SourceFile
{
private string? _content;
public SourceFile(string path)
{
Path = path ?? throw new ArgumentNullException(nameof(path));
}
public string Path { get; }
public string GetText() => _content ??= File.ReadAllText(Path);
public override string ToString() => Path;
public override bool Equals(object? obj)
{
return obj is SourceFile other && other.Path == Path;
}
public override int GetHashCode()
{
return HashCode.Combine(typeof(SourceFile), Path);
}
public static bool operator ==(SourceFile? left, SourceFile? right) => Equals(left, right);
public static bool operator !=(SourceFile? left, SourceFile? right) => !Equals(left, right);
}
public class SourceFileSpan
{
public SourceFileSpan(SourceFile sourceFile, SourceSpan span)
{
SourceFile = sourceFile;
Span = span;
}
public SourceFile SourceFile { get; }
public SourceSpan Span { get; }
}

View File

@@ -2,8 +2,6 @@ namespace NubLang.Code;
public readonly struct SourceLocation : IEquatable<SourceLocation>, IComparable<SourceLocation>
{
public static SourceLocation Zero => new(0, 0);
public SourceLocation(int line, int column)
{
Line = line;

View File

@@ -2,34 +2,33 @@ namespace NubLang.Code;
public readonly struct SourceSpan : IEquatable<SourceSpan>, IComparable<SourceSpan>
{
public static SourceSpan Zero => new(SourceLocation.Zero, SourceLocation.Zero);
public static SourceSpan Merge(params IEnumerable<SourceSpan> spans)
{
var spanArray = spans as SourceSpan[] ?? spans.ToArray();
if (spanArray.Length == 0)
{
return Zero;
return new SourceSpan(string.Empty, new SourceLocation(0, 0), new SourceLocation(0, 0));
}
var minStart = spanArray.Min(s => s.Start);
var maxEnd = spanArray.Max(s => s.End);
return new SourceSpan(minStart, maxEnd);
return new SourceSpan(spanArray[0].FilePath, minStart, maxEnd);
}
public SourceSpan(SourceLocation start, SourceLocation end)
public SourceSpan(string filePath, SourceLocation start, SourceLocation end)
{
if (start > end)
{
throw new ArgumentException("Start location cannot be after end location");
}
FilePath = filePath;
Start = start;
End = end;
}
public string FilePath { get; }
public SourceLocation Start { get; }
public SourceLocation End { get; }
@@ -37,15 +36,15 @@ public readonly struct SourceSpan : IEquatable<SourceSpan>, IComparable<SourceSp
{
if (Start == End)
{
return $"{Start}";
return $"{FilePath}:{Start}";
}
if (Start.Line == End.Line)
{
return Start.Column == End.Column ? $"{Start}" : $"{Start.Line}:{Start.Column}-{End.Column}";
return Start.Column == End.Column ? $"{FilePath}:{Start}" : $"{FilePath}:{Start.Line}:{Start.Column}-{End.Column}";
}
return $"{Start}-{End}";
return $"{FilePath}:{Start}-{End}";
}
public bool Equals(SourceSpan other) => Start == other.Start && End == other.End;
@@ -54,7 +53,7 @@ public readonly struct SourceSpan : IEquatable<SourceSpan>, IComparable<SourceSp
public static bool operator ==(SourceSpan left, SourceSpan right) => Equals(left, right);
public static bool operator !=(SourceSpan left, SourceSpan right) => !Equals(left, right);
public static bool operator <(SourceSpan left, SourceSpan right) => left.CompareTo(right) < 0;
public static bool operator <=(SourceSpan left, SourceSpan right) => left.CompareTo(right) <= 0;
public static bool operator >(SourceSpan left, SourceSpan right) => left.CompareTo(right) > 0;
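Because SourceSpan now carries its FilePath, ToString produces a self-contained location string. A small illustrative example (the path and positions are made up):

var span = new SourceSpan("src/main.nub", new SourceLocation(3, 5), new SourceLocation(3, 12));
Console.WriteLine(span); // prints "src/main.nub:3:5-12"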

View File

@@ -11,7 +11,7 @@ public class Diagnostic
{
private readonly DiagnosticSeverity _severity;
private readonly string _message;
private SourceFileSpan? _fileSpan;
private SourceSpan? _span;
private string? _help;
public DiagnosticBuilder(DiagnosticSeverity severity, string message)
@@ -24,12 +24,7 @@ public class Diagnostic
{
if (node != null)
{
var first = node.Tokens.FirstOrDefault();
if (first?.FileSpan != null)
{
var span = SourceSpan.Merge(node.Tokens.Select(x => x.FileSpan.Span));
At(new SourceFileSpan(first.FileSpan.SourceFile, span));
}
_span = SourceSpan.Merge(node.Tokens.Select(x => x.Span));
}
return this;
@@ -39,29 +34,35 @@ public class Diagnostic
{
if (token != null)
{
At(token.FileSpan);
At(token.Span);
}
return this;
}
public DiagnosticBuilder At(SourceFileSpan? fileSpan)
public DiagnosticBuilder At(SourceSpan? span)
{
if (fileSpan != null)
if (span != null)
{
_fileSpan = fileSpan;
_span = span;
}
return this;
}
public DiagnosticBuilder At(string filePath, int line, int column)
{
_span = new SourceSpan(filePath, new SourceLocation(line, column), new SourceLocation(line, column));
return this;
}
public DiagnosticBuilder WithHelp(string help)
{
_help = help;
return this;
}
public Diagnostic Build() => new(_severity, _message, _help, _fileSpan);
public Diagnostic Build() => new(_severity, _message, _help, _span);
}
public static DiagnosticBuilder Error(string message) => new(DiagnosticSeverity.Error, message);
@@ -71,14 +72,14 @@ public class Diagnostic
public DiagnosticSeverity Severity { get; }
public string Message { get; }
public string? Help { get; }
public SourceFileSpan? FileSpan { get; }
public SourceSpan? Span { get; }
private Diagnostic(DiagnosticSeverity severity, string message, string? help, SourceFileSpan? fileSpan)
private Diagnostic(DiagnosticSeverity severity, string message, string? help, SourceSpan? span)
{
Severity = severity;
Message = message;
Help = help;
FileSpan = fileSpan;
Span = span;
}
public string FormatANSI()
@@ -93,23 +94,23 @@ public class Diagnostic
_ => ConsoleColors.Colorize("unknown", ConsoleColors.Bold + ConsoleColors.White)
});
if (FileSpan != null)
if (Span.HasValue)
{
sb.Append(ConsoleColors.Colorize($" at {FileSpan.SourceFile.Path}:{FileSpan.Span}", ConsoleColors.Faint));
sb.Append(ConsoleColors.Colorize($" at {Span.Value}", ConsoleColors.Faint));
}
sb.Append(": ");
sb.Append(ConsoleColors.Colorize(Message, ConsoleColors.BrightWhite));
if (FileSpan != null)
if (Span.HasValue)
{
sb.AppendLine();
var text = FileSpan.SourceFile.GetText();
var text = File.ReadAllText(Span.Value.FilePath);
var lines = text.Split('\n');
var startLine = FileSpan.Span.Start.Line;
var endLine = FileSpan.Span.End.Line;
var startLine = Span.Value.Start.Line;
var endLine = Span.Value.End.Line;
const int CONTEXT_LINES = 3;
@@ -126,8 +127,8 @@ public class Diagnostic
sb.Append('╮');
sb.AppendLine();
var tokenizer = new Tokenizer(FileSpan.SourceFile);
var tokens = tokenizer.Tokenize().ToList();
var tokenizer = new Tokenizer(Span.Value.FilePath, text);
tokenizer.Tokenize();
for (var i = contextStartLine; i <= contextEndLine; i++)
{
@@ -136,7 +137,7 @@ public class Diagnostic
sb.Append("│ ");
sb.Append(i.ToString().PadRight(numberPadding));
sb.Append(" │ ");
sb.Append(ApplySyntaxHighlighting(line.PadRight(codePadding), i, tokens));
sb.Append(ApplySyntaxHighlighting(line.PadRight(codePadding), i, tokenizer.Tokens));
sb.Append(" │");
sb.AppendLine();
@@ -147,12 +148,12 @@ public class Diagnostic
if (i == startLine)
{
markerStartColumn = FileSpan.Span.Start.Column;
markerStartColumn = Span.Value.Start.Column;
}
if (i == endLine)
{
markerEndColumn = FileSpan.Span.End.Column;
markerEndColumn = Span.Value.End.Column;
}
var markerLength = markerEndColumn - markerStartColumn;
@@ -197,8 +198,8 @@ public class Diagnostic
{
var sb = new StringBuilder();
var lineTokens = tokens
.Where(t => t.FileSpan.Span.Start.Line == lineNumber)
.OrderBy(t => t.FileSpan.Span.Start.Column)
.Where(t => t.Span.Start.Line == lineNumber)
.OrderBy(t => t.Span.Start.Column)
.ToList();
if (lineTokens.Count == 0)
@@ -210,8 +211,8 @@ public class Diagnostic
foreach (var token in lineTokens)
{
var tokenStart = token.FileSpan.Span.Start.Column;
var tokenEnd = token.FileSpan.Span.End.Column;
var tokenStart = token.Span.Start.Column;
var tokenEnd = token.Span.End.Column;
if (tokenStart > currentColumn)
{
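With SourceFileSpan gone, diagnostics attach a plain SourceSpan, and the builder gains an At(filePath, line, column) overload. A usage sketch based on the builder API above (the message, location, and help text are examples):

var diagnostic = Diagnostic
    .Error("More than one period found in float literal")
    .At("src/main.nub", 12, 8)
    .WithHelp("Remove the extra '.'")
    .Build();
Console.WriteLine(diagnostic.FormatANSI());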

View File

@@ -7,7 +7,6 @@ namespace NubLang.Parsing;
public sealed class Parser
{
private readonly List<Diagnostic> _diagnostics = [];
private readonly HashSet<string> _templateArguments = [];
private List<Token> _tokens = [];
private int _tokenIndex;
@@ -16,14 +15,11 @@ public sealed class Parser
private Token? CurrentToken => _tokenIndex < _tokens.Count ? _tokens[_tokenIndex] : null;
private bool HasToken => CurrentToken != null;
public List<Diagnostic> GetDiagnostics()
{
return _diagnostics;
}
public List<Diagnostic> Diagnostics { get; } = [];
public SyntaxTree Parse(List<Token> tokens)
{
_diagnostics.Clear();
Diagnostics.Clear();
_tokens = tokens;
_tokenIndex = 0;
_moduleName = string.Empty;
@@ -51,7 +47,7 @@ public sealed class Parser
}
catch (ParseException e)
{
_diagnostics.Add(e.Diagnostic);
Diagnostics.Add(e.Diagnostic);
while (HasToken)
{
if (CurrentToken is SymbolToken { Symbol: Symbol.Module or Symbol.Import })
@@ -102,7 +98,7 @@ public sealed class Parser
}
catch (ParseException e)
{
_diagnostics.Add(e.Diagnostic);
Diagnostics.Add(e.Diagnostic);
while (HasToken)
{
if (CurrentToken is SymbolToken { Symbol: Symbol.Extern or Symbol.Func or Symbol.Struct })
@@ -692,7 +688,7 @@ public sealed class Parser
}
catch (ParseException ex)
{
_diagnostics.Add(ex.Diagnostic);
Diagnostics.Add(ex.Diagnostic);
if (HasToken)
{
Next();

View File

@@ -2,22 +2,6 @@
namespace NubLang.Tokenization;
public abstract class Token(SourceFileSpan fileSpan)
{
public SourceFileSpan FileSpan { get; } = fileSpan;
}
public class IdentifierToken(SourceFileSpan fileSpan, string value) : Token(fileSpan)
{
public string Value { get; } = value;
}
public class LiteralToken(SourceFileSpan fileSpan, LiteralKind kind, string value) : Token(fileSpan)
{
public LiteralKind Kind { get; } = kind;
public string Value { get; } = value;
}
public enum LiteralKind
{
Integer,
@@ -26,11 +10,6 @@ public enum LiteralKind
Bool
}
public class SymbolToken(SourceFileSpan fileSpan, Symbol symbol) : Token(fileSpan)
{
public Symbol Symbol { get; } = symbol;
}
public enum Symbol
{
Func,
@@ -83,4 +62,12 @@ public enum Symbol
Defer,
At,
Enum,
}
}
public abstract record Token(string FileName, SourceSpan Span);
public record IdentifierToken(string FileName, SourceSpan Span, string Value) : Token(FileName, Span);
public record LiteralToken(string FileName, SourceSpan Span, LiteralKind Kind, string Value) : Token(FileName, Span);
public record SymbolToken(string FileName, SourceSpan Span, Symbol Symbol) : Token(FileName, Span);

View File

@@ -68,171 +68,196 @@ public sealed class Tokenizer
.Select(kvp => (kvp.Key, kvp.Value))
.ToArray();
private readonly SourceFile _sourceFile;
private readonly List<Diagnostic> _diagnostics = [];
private int _index;
private readonly string _fileName;
private readonly string _content;
private int _index = 0;
private int _line = 1;
private int _column = 1;
public Tokenizer(SourceFile sourceFile)
public Tokenizer(string fileName, string content)
{
_sourceFile = sourceFile;
_fileName = fileName;
_content = content;
}
public List<Diagnostic> GetDiagnostics() => _diagnostics;
public List<Diagnostic> Diagnostics { get; } = [];
public List<Token> Tokens { get; } = [];
public IEnumerable<Token> Tokenize()
public void Tokenize()
{
Diagnostics.Clear();
Tokens.Clear();
_index = 0;
_line = 1;
_column = 1;
while (Peek() != null)
while (Peek().HasValue)
{
var current = Peek()!.Value;
if (char.IsWhiteSpace(current))
try
{
Next();
continue;
}
if (current == '/' && Peek(1) == '/')
{
while (Peek().HasValue && Peek() != '\n')
// Skip whitespace and increment line counter if newline
var current = Peek()!.Value;
if (char.IsWhiteSpace(current))
{
if (current is '\n')
{
_line += 1;
_column = 1;
}
Next();
}
continue;
}
var tokenStartIndex = _index;
if (char.IsLetter(current) || current == '_')
{
var buffer = string.Empty;
while (Peek() != null && (char.IsLetterOrDigit(Peek()!.Value) || Peek() == '_'))
{
buffer += Peek();
Next();
}
if (Keywords.TryGetValue(buffer, out var keywordSymbol))
{
yield return new SymbolToken(GetSourceFileSpan(tokenStartIndex), keywordSymbol);
continue;
}
if (buffer is "true" or "false")
// Skip single line comments but keep newline so next iteration increments the line counter
if (current == '/' && Peek(1) == '/')
{
yield return new LiteralToken(GetSourceFileSpan(tokenStartIndex), LiteralKind.Bool, buffer);
while (Peek() is not '\n')
{
Next();
}
continue;
}
yield return new IdentifierToken(GetSourceFileSpan(tokenStartIndex), buffer);
continue;
Tokens.Add(ParseToken(current, _line, _column));
}
if (char.IsDigit(current))
{
var isFloat = false;
var buffer = string.Empty;
while (Peek() != null)
{
var next = Peek()!.Value;
if (next == '.')
{
if (isFloat)
{
throw new Exception("More than one period found in float literal");
}
isFloat = true;
buffer += next;
Next();
}
else if (char.IsDigit(next))
{
buffer += next;
Next();
}
else
{
break;
}
}
yield return new LiteralToken(GetSourceFileSpan(tokenStartIndex), isFloat ? LiteralKind.Float : LiteralKind.Integer, buffer);
continue;
}
if (current == '"')
catch (TokenizerException e)
{
Diagnostics.Add(e.Diagnostic);
Next();
var buffer = string.Empty;
}
}
}
while (true)
private Token ParseToken(char current, int lineStart, int columnStart)
{
if (char.IsLetter(current) || current == '_')
{
var buffer = string.Empty;
while (Peek() != null && (char.IsLetterOrDigit(Peek()!.Value) || Peek() == '_'))
{
buffer += Peek();
Next();
}
if (Keywords.TryGetValue(buffer, out var keywordSymbol))
{
return new SymbolToken(_fileName, CreateSpan(lineStart, columnStart), keywordSymbol);
}
if (buffer is "true" or "false")
{
return new LiteralToken(_fileName, CreateSpan(lineStart, columnStart), LiteralKind.Bool, buffer);
}
return new IdentifierToken(_fileName, CreateSpan(lineStart, columnStart), buffer);
}
if (char.IsDigit(current))
{
var isFloat = false;
var buffer = string.Empty;
while (Peek() != null)
{
var next = Peek()!.Value;
if (next == '.')
{
if (Peek() == null)
if (isFloat)
{
throw new Exception("Unclosed string literal");
}
var next = Peek()!.Value;
if (next == '"')
{
Next();
break;
throw new TokenizerException(Diagnostic
.Error("More than one period found in float literal")
.At(_fileName, _line, _column)
.Build());
}
isFloat = true;
buffer += next;
Next();
}
yield return new LiteralToken(GetSourceFileSpan(tokenStartIndex), LiteralKind.String, buffer);
continue;
}
var foundMatch = false;
foreach (var (pattern, symbol) in OrderedSymbols)
{
for (var i = 0; i < pattern.Length; i++)
else if (char.IsDigit(next))
{
var c = Peek(i);
if (!c.HasValue || c.Value != pattern[i]) break;
if (i == pattern.Length - 1)
{
for (var j = 0; j <= i; j++)
{
Next();
}
yield return new SymbolToken(GetSourceFileSpan(tokenStartIndex), symbol);
foundMatch = true;
break;
}
buffer += next;
Next();
}
if (foundMatch)
else
{
break;
}
}
if (foundMatch)
return new LiteralToken(_fileName, CreateSpan(lineStart, columnStart), isFloat ? LiteralKind.Float : LiteralKind.Integer, buffer);
}
if (current == '"')
{
Next();
var buffer = string.Empty;
while (true)
{
continue;
var next = Peek();
if (!next.HasValue)
{
throw new TokenizerException(Diagnostic
.Error("Unclosed string literal")
.At(_fileName, _line, _column)
.Build());
}
if (next is '\n')
{
_line += 1;
break;
}
if (next is '"')
{
Next();
break;
}
buffer += next;
Next();
}
_diagnostics.Add(Diagnostic.Error($"Unknown token '{current}'").At(GetSourceFileSpan(tokenStartIndex)).Build());
Next();
return new LiteralToken(_fileName, CreateSpan(lineStart, columnStart), LiteralKind.String, buffer);
}
foreach (var (pattern, symbol) in OrderedSymbols)
{
for (var i = 0; i < pattern.Length; i++)
{
var c = Peek(i);
if (!c.HasValue || c.Value != pattern[i]) break;
if (i == pattern.Length - 1)
{
for (var j = 0; j <= i; j++)
{
Next();
}
return new SymbolToken(_fileName, CreateSpan(lineStart, columnStart), symbol);
}
}
}
throw new TokenizerException(Diagnostic.Error($"Unknown token '{current}'").Build());
}
private SourceSpan CreateSpan(int lineStart, int columnStart)
{
return new SourceSpan(_fileName, new SourceLocation(lineStart, columnStart), new SourceLocation(_line, _column));
}
private char? Peek(int offset = 0)
{
if (_index + offset < _sourceFile.GetText().Length)
if (_index + offset < _content.Length)
{
return _sourceFile.GetText()[_index + offset];
return _content[_index + offset];
}
return null;
@@ -240,34 +265,17 @@ public sealed class Tokenizer
private void Next()
{
_index++;
_index += 1;
_column += 1;
}
}
private SourceFileSpan GetSourceFileSpan(int tokenStartIndex)
public class TokenizerException : Exception
{
public Diagnostic Diagnostic { get; }
public TokenizerException(Diagnostic diagnostic) : base(diagnostic.Message)
{
var start = CalculateSourceLocation(tokenStartIndex);
var end = CalculateSourceLocation(_index);
return new SourceFileSpan(_sourceFile, new SourceSpan(start, end));
}
private SourceLocation CalculateSourceLocation(int index)
{
var line = 1;
var column = 1;
for (var i = 0; i < index && i < _sourceFile.GetText().Length; i++)
{
if (_sourceFile.GetText()[i] == '\n')
{
line++;
column = 1;
}
else
{
column++;
}
}
return new SourceLocation(line, column);
Diagnostic = diagnostic;
}
}
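Putting it together: the new Tokenizer is constructed with the file name and its text, tokenizes eagerly, and exposes results through properties, matching the updated call site in Program.cs. A minimal sketch (the path is just an example):

var path = "src/main.nub";
var tokenizer = new Tokenizer(path, File.ReadAllText(path));
tokenizer.Tokenize(); // fills tokenizer.Tokens and tokenizer.Diagnostics

var parser = new Parser();
var syntaxTree = parser.Parse(tokenizer.Tokens);

var diagnostics = new List<Diagnostic>();
diagnostics.AddRange(tokenizer.Diagnostics);
diagnostics.AddRange(parser.Diagnostics);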

View File

@@ -6,7 +6,7 @@ out: .build/out.o
.build/out.o: $(NUBC) src/main.nub src/raylib.nub
$(NUBC) src/main.nub src/raylib.nub
# .PHONY: $(NUBC)
.PHONY: $(NUBC)
$(NUBC):
dotnet build ../compiler/NubLang.CLI/NubLang.CLI.csproj