This repository has been archived on 2025-10-24. You can view files and clone it, but cannot push or open issues or pull requests.
Files
nub-lang-archive-2/compiler/NubLang/Tokenization/Tokenizer.cs
nub31 fd27d2709d ...
2025-09-11 21:22:30 +02:00

266 lines
7.6 KiB
C#

using NubLang.Code;
using NubLang.Diagnostics;
namespace NubLang.Tokenization;
/// <summary>
/// Splits the text of a source file into keyword/symbol, literal and identifier tokens.
/// </summary>
/// <remarks>
/// Lexical errors (unknown characters, number literals with more than one period, unclosed
/// string literals) are recorded as diagnostics, available through <see cref="GetDiagnostics"/>,
/// instead of being thrown. <see cref="Tokenize"/> is a lazy iterator, so the diagnostics are only
/// complete once the returned sequence has been fully enumerated. The cursor lives in instance
/// fields, so two enumerations of the same tokenizer must not be interleaved.
/// </remarks>
public sealed class Tokenizer
{
    /// <summary>Reserved words and the symbol each one produces.</summary>
    private static readonly Dictionary<string, Symbol> Keywords = new()
    {
        ["func"] = Symbol.Func,
        ["if"] = Symbol.If,
        ["else"] = Symbol.Else,
        ["while"] = Symbol.While,
        ["break"] = Symbol.Break,
        ["continue"] = Symbol.Continue,
        ["return"] = Symbol.Return,
        ["struct"] = Symbol.Struct,
        ["let"] = Symbol.Let,
        ["calls"] = Symbol.Calls,
        ["interface"] = Symbol.Interface,
        ["for"] = Symbol.For,
        ["extern"] = Symbol.Extern,
        ["module"] = Symbol.Module,
        ["export"] = Symbol.Export,
        ["import"] = Symbol.Import,
    };

    /// <summary>
    /// Operator and punctuation patterns, sorted longest first so that a two-character
    /// operator such as "==" is always tried before its one-character prefix "=".
    /// </summary>
    private static readonly (string Pattern, Symbol Symbol)[] OrderedSymbols = new (string Pattern, Symbol Symbol)[]
        {
            ("==", Symbol.Equal),
            ("!=", Symbol.NotEqual),
            ("<=", Symbol.LessThanOrEqual),
            (">=", Symbol.GreaterThanOrEqual),
            ("<<", Symbol.LeftShift),
            (">>", Symbol.RightShift),
            ("&&", Symbol.And),
            ("||", Symbol.Or),
            (":", Symbol.Colon),
            ("(", Symbol.OpenParen),
            (")", Symbol.CloseParen),
            ("{", Symbol.OpenBrace),
            ("}", Symbol.CloseBrace),
            ("[", Symbol.OpenBracket),
            ("]", Symbol.CloseBracket),
            (",", Symbol.Comma),
            (".", Symbol.Period),
            ("=", Symbol.Assign),
            ("<", Symbol.LessThan),
            (">", Symbol.GreaterThan),
            ("+", Symbol.Plus),
            ("-", Symbol.Minus),
            ("*", Symbol.Star),
            ("/", Symbol.ForwardSlash),
            ("!", Symbol.Bang),
            ("^", Symbol.Caret),
            ("&", Symbol.Ampersand),
            (";", Symbol.Semi),
            ("%", Symbol.Percent),
            ("|", Symbol.Pipe),
        }
        .OrderByDescending(entry => entry.Pattern.Length)
        .ToArray();

    private readonly SourceFile _sourceFile;
    private readonly List<Diagnostic> _diagnostics = [];

    // Cursor into the source text plus the 1-based line/column of that position.
    // Line and column are maintained by Next() so that building a span never rescans the text.
    private int _index;
    private int _line = 1;
    private int _column = 1;

    /// <summary>Creates a tokenizer over <paramref name="sourceFile"/>.</summary>
    /// <param name="sourceFile">The file whose text will be tokenized.</param>
    public Tokenizer(SourceFile sourceFile)
    {
        _sourceFile = sourceFile;
    }

    /// <summary>
    /// Returns the diagnostics collected by the most recent run of <see cref="Tokenize"/>.
    /// </summary>
    public IReadOnlyList<Diagnostic> GetDiagnostics() => _diagnostics;

    /// <summary>
    /// Lazily tokenizes the source text from the beginning.
    /// </summary>
    /// <remarks>
    /// Whitespace and <c>//</c> line comments are skipped. Each enumeration restarts at the first
    /// character and clears previously collected diagnostics, so enumerating twice does not
    /// report every problem twice. Malformed input never throws: the problem is recorded as a
    /// diagnostic, no token is produced for the offending text, and scanning continues.
    /// </remarks>
    /// <returns>The tokens in source order.</returns>
    public IEnumerable<Token> Tokenize()
    {
        _index = 0;
        _line = 1;
        _column = 1;
        _diagnostics.Clear();

        while (Peek().TryGetValue(out var current))
        {
            if (char.IsWhiteSpace(current))
            {
                Next();
                continue;
            }

            if (current == '/' && Peek(1).TryGetValue(out var following) && following == '/')
            {
                SkipLineComment();
                continue;
            }

            var start = CurrentLocation();

            if (char.IsLetter(current) || current == '_')
            {
                var word = ReadWord();
                var span = GetSourceFileSpan(start);
                if (Keywords.TryGetValue(word, out var keyword))
                {
                    yield return new SymbolToken(span, keyword);
                }
                else if (word is "true" or "false")
                {
                    yield return new LiteralToken(span, LiteralKind.Bool, word);
                }
                else
                {
                    yield return new IdentifierToken(span, word);
                }

                continue;
            }

            if (char.IsDigit(current))
            {
                var wellFormed = TryReadNumber(out var text, out var isFloat);
                var span = GetSourceFileSpan(start);
                if (wellFormed)
                {
                    yield return new LiteralToken(span, isFloat ? LiteralKind.Float : LiteralKind.Integer, text);
                }
                else
                {
                    _diagnostics.Add(Diagnostic.Error("More than one period found in float literal").At(span).Build());
                }

                continue;
            }

            if (current == '"')
            {
                Next();

                // Captured now so an unclosed literal is reported at its opening quote
                // rather than across the whole remainder of the file.
                var openingQuote = GetSourceFileSpan(start);
                if (TryReadStringBody(out var value))
                {
                    yield return new LiteralToken(GetSourceFileSpan(start), LiteralKind.String, value);
                }
                else
                {
                    _diagnostics.Add(Diagnostic.Error("Unclosed string literal").At(openingQuote).Build());
                }

                continue;
            }

            if (TryReadSymbol(out var symbol))
            {
                yield return new SymbolToken(GetSourceFileSpan(start), symbol);
                continue;
            }

            _diagnostics.Add(Diagnostic.Error($"Unknown token '{current}'").At(GetSourceFileSpan(start)).Build());
            Next();
        }
    }

    /// <summary>Advances to the next line feed (left unconsumed) or to the end of the text.</summary>
    private void SkipLineComment()
    {
        while (Peek().TryGetValue(out var ch) && ch != '\n')
        {
            Next();
        }
    }

    /// <summary>Consumes and returns a run of letters, digits and underscores.</summary>
    private string ReadWord()
    {
        var builder = new System.Text.StringBuilder();
        while (Peek().TryGetValue(out var next) && (char.IsLetterOrDigit(next) || next == '_'))
        {
            builder.Append(next);
            Next();
        }

        return builder.ToString();
    }

    /// <summary>
    /// Consumes a run of digits and periods.
    /// </summary>
    /// <param name="text">The consumed characters.</param>
    /// <param name="isFloat">True when the run contains at least one period.</param>
    /// <returns>False when the run contains more than one period; the whole run is still consumed.</returns>
    private bool TryReadNumber(out string text, out bool isFloat)
    {
        var builder = new System.Text.StringBuilder();
        var periods = 0;
        while (Peek().TryGetValue(out var next) && (char.IsDigit(next) || next == '.'))
        {
            if (next == '.')
            {
                periods++;
            }

            builder.Append(next);
            Next();
        }

        text = builder.ToString();
        isFloat = periods > 0;
        return periods <= 1;
    }

    /// <summary>
    /// Consumes characters up to and including the closing double quote. The opening quote must
    /// already have been consumed. Characters are taken verbatim; no escape sequences are processed.
    /// </summary>
    /// <param name="value">The characters between the quotes (or up to the end of the text).</param>
    /// <returns>False when the text ends before a closing quote is found.</returns>
    private bool TryReadStringBody(out string value)
    {
        var builder = new System.Text.StringBuilder();
        while (Peek().TryGetValue(out var next))
        {
            Next();
            if (next == '"')
            {
                value = builder.ToString();
                return true;
            }

            builder.Append(next);
        }

        value = builder.ToString();
        return false;
    }

    /// <summary>
    /// Tries to consume the longest operator/punctuation pattern starting at the cursor.
    /// </summary>
    /// <param name="symbol">The matched symbol when the method returns true.</param>
    /// <returns>True when a pattern matched and was consumed; otherwise the cursor is unchanged.</returns>
    private bool TryReadSymbol(out Symbol symbol)
    {
        foreach (var (pattern, candidate) in OrderedSymbols)
        {
            if (!IsAtPattern(pattern))
            {
                continue;
            }

            for (var i = 0; i < pattern.Length; i++)
            {
                Next();
            }

            symbol = candidate;
            return true;
        }

        symbol = default;
        return false;
    }

    /// <summary>Returns true when the text at the cursor starts with <paramref name="pattern"/>.</summary>
    private bool IsAtPattern(string pattern)
    {
        for (var i = 0; i < pattern.Length; i++)
        {
            if (!Peek(i).TryGetValue(out var c) || c != pattern[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Returns the character <paramref name="offset"/> positions ahead of the cursor without
    /// consuming it, or an empty optional when that position is past the end of the text.
    /// </summary>
    private Optional<char> Peek(int offset = 0)
    {
        var text = _sourceFile.GetText();
        var position = _index + offset;
        if (position < text.Length)
        {
            return text[position];
        }

        return Optional<char>.Empty();
    }

    /// <summary>
    /// Consumes one character, keeping the line/column counters in step: a line feed starts a
    /// new line at column 1, any other character moves one column to the right.
    /// </summary>
    private void Next()
    {
        var text = _sourceFile.GetText();
        if (_index < text.Length)
        {
            if (text[_index] == '\n')
            {
                _line++;
                _column = 1;
            }
            else
            {
                _column++;
            }
        }

        _index++;
    }

    /// <summary>Returns the 1-based line and column of the cursor.</summary>
    private SourceLocation CurrentLocation()
    {
        return new SourceLocation(_line, _column);
    }

    /// <summary>Builds the span running from <paramref name="start"/> to the current cursor position.</summary>
    private SourceFileSpan GetSourceFileSpan(SourceLocation start)
    {
        return new SourceFileSpan(_sourceFile, new SourceSpan(start, CurrentLocation()));
    }
}