...
This commit is contained in:
266
compiler/NubLang/Tokenization/Tokenizer.cs
Normal file
266
compiler/NubLang/Tokenization/Tokenizer.cs
Normal file
@@ -0,0 +1,266 @@
|
||||
using NubLang.Code;
|
||||
using NubLang.Diagnostics;
|
||||
|
||||
namespace NubLang.Tokenization;
|
||||
|
||||
/// <summary>
/// Splits the text of a <see cref="SourceFile"/> into tokens: keywords and operators
/// (<see cref="SymbolToken"/>), literals (<see cref="LiteralToken"/>) and identifiers
/// (<see cref="IdentifierToken"/>). Characters that start no known token are reported
/// through <see cref="GetDiagnostics"/> and skipped.
/// </summary>
public sealed class Tokenizer
{
    // Reserved words. A word found here is emitted as a SymbolToken instead of an identifier.
    private static readonly Dictionary<string, Symbol> Keywords = new()
    {
        ["func"] = Symbol.Func,
        ["if"] = Symbol.If,
        ["else"] = Symbol.Else,
        ["while"] = Symbol.While,
        ["break"] = Symbol.Break,
        ["continue"] = Symbol.Continue,
        ["return"] = Symbol.Return,
        ["struct"] = Symbol.Struct,
        ["let"] = Symbol.Let,
        ["calls"] = Symbol.Calls,
        ["interface"] = Symbol.Interface,
        ["for"] = Symbol.For,
        ["extern"] = Symbol.Extern,
        ["module"] = Symbol.Module,
        ["export"] = Symbol.Export,
        ["import"] = Symbol.Import,
    };

    // Operator and punctuation spellings. Kept as a plain table (not a dictionary): it is
    // only ever scanned in order, never looked up by key.
    private static readonly (string Pattern, Symbol Symbol)[] Symbols =
    [
        ("==", Symbol.Equal),
        ("!=", Symbol.NotEqual),
        ("<=", Symbol.LessThanOrEqual),
        (">=", Symbol.GreaterThanOrEqual),
        ("<<", Symbol.LeftShift),
        (">>", Symbol.RightShift),
        ("&&", Symbol.And),
        ("||", Symbol.Or),
        (":", Symbol.Colon),
        ("(", Symbol.OpenParen),
        (")", Symbol.CloseParen),
        ("{", Symbol.OpenBrace),
        ("}", Symbol.CloseBrace),
        ("[", Symbol.OpenBracket),
        ("]", Symbol.CloseBracket),
        (",", Symbol.Comma),
        (".", Symbol.Period),
        ("=", Symbol.Assign),
        ("<", Symbol.LessThan),
        (">", Symbol.GreaterThan),
        ("+", Symbol.Plus),
        ("-", Symbol.Minus),
        ("*", Symbol.Star),
        ("/", Symbol.ForwardSlash),
        ("!", Symbol.Bang),
        ("^", Symbol.Caret),
        ("&", Symbol.Ampersand),
        (";", Symbol.Semi),
        ("%", Symbol.Percent),
        ("|", Symbol.Pipe),
    ];

    // Longest spellings first, so that "==" is tried before "=" and "<<" before "<".
    private static readonly (string Pattern, Symbol Symbol)[] OrderedSymbols = Symbols
        .OrderByDescending(entry => entry.Pattern.Length)
        .ToArray();

    private readonly SourceFile _sourceFile;
    private readonly List<Diagnostic> _diagnostics = [];
    private int _index;

    // Resume point for CalculateSourceLocation: the line/column already resolved for
    // offset _locationIndex, so later lookups do not rescan the text from the start.
    private int _locationIndex;
    private int _locationLine = 1;
    private int _locationColumn = 1;

    /// <summary>Creates a tokenizer over <paramref name="sourceFile"/>.</summary>
    /// <param name="sourceFile">The file whose text is tokenized.</param>
    public Tokenizer(SourceFile sourceFile)
    {
        _sourceFile = sourceFile;
    }

    /// <summary>
    /// Returns the diagnostics collected so far by the most recent enumeration of
    /// <see cref="Tokenize"/>.
    /// </summary>
    public IReadOnlyList<Diagnostic> GetDiagnostics() => _diagnostics;

    /// <summary>
    /// Lazily tokenizes the source text from the beginning. Whitespace and <c>//</c> line
    /// comments are skipped. Each enumeration restarts at offset 0 and starts with an empty
    /// diagnostics list.
    /// </summary>
    /// <returns>The tokens in source order.</returns>
    /// <exception cref="Exception">
    /// Thrown during enumeration when a numeric literal contains more than one period or a
    /// string literal is not closed before the end of the text.
    /// </exception>
    public IEnumerable<Token> Tokenize()
    {
        _index = 0;
        // Without this, enumerating the sequence twice would report every diagnostic twice.
        _diagnostics.Clear();

        while (Peek().TryGetValue(out var current))
        {
            if (char.IsWhiteSpace(current))
            {
                Next();
                continue;
            }

            if (current == '/' && Peek(1).TryGetValue(out var following) && following == '/')
            {
                // Line comment: drop everything up to (not including) the newline; the
                // newline itself is consumed as whitespace on the next pass.
                while (Peek().TryGetValue(out var commentChar) && commentChar != '\n')
                {
                    Next();
                }

                continue;
            }

            var tokenStartIndex = _index;

            if (char.IsLetter(current) || current == '_')
            {
                yield return ReadWord(tokenStartIndex);
            }
            else if (char.IsDigit(current))
            {
                yield return ReadNumber(tokenStartIndex);
            }
            else if (current == '"')
            {
                yield return ReadString(tokenStartIndex);
            }
            else if (TryReadSymbol(out var symbol))
            {
                yield return new SymbolToken(GetSourceFileSpan(tokenStartIndex), symbol);
            }
            else
            {
                // Consume the character first so the reported span covers it rather than
                // being empty.
                Next();
                _diagnostics.Add(Diagnostic.Error($"Unknown token '{current}'").At(GetSourceFileSpan(tokenStartIndex)).Build());
            }
        }
    }

    /// <summary>
    /// Reads a run of letters, digits and underscores and classifies it as a keyword,
    /// a <c>true</c>/<c>false</c> literal, or an identifier.
    /// </summary>
    private Token ReadWord(int tokenStartIndex)
    {
        var buffer = new System.Text.StringBuilder();

        while (Peek().TryGetValue(out var next) && (char.IsLetterOrDigit(next) || next == '_'))
        {
            buffer.Append(next);
            Next();
        }

        var word = buffer.ToString();
        var span = GetSourceFileSpan(tokenStartIndex);

        if (Keywords.TryGetValue(word, out var keywordSymbol))
        {
            return new SymbolToken(span, keywordSymbol);
        }

        if (word is "true" or "false")
        {
            return new LiteralToken(span, LiteralKind.Bool, word);
        }

        return new IdentifierToken(span, word);
    }

    /// <summary>
    /// Reads a run of digits with at most one period. The literal is a float when a period
    /// was seen, otherwise an integer.
    /// </summary>
    /// <exception cref="Exception">A second period is encountered.</exception>
    private LiteralToken ReadNumber(int tokenStartIndex)
    {
        var isFloat = false;
        var buffer = new System.Text.StringBuilder();

        while (Peek().TryGetValue(out var next))
        {
            if (next == '.')
            {
                if (isFloat)
                {
                    throw new Exception("More than one period found in float literal");
                }

                isFloat = true;
            }
            else if (!char.IsDigit(next))
            {
                break;
            }

            buffer.Append(next);
            Next();
        }

        var kind = isFloat ? LiteralKind.Float : LiteralKind.Integer;
        return new LiteralToken(GetSourceFileSpan(tokenStartIndex), kind, buffer.ToString());
    }

    /// <summary>
    /// Reads a double-quoted string. The token value is the raw text between the quotes;
    /// no escape sequences are interpreted. The span includes both quotes.
    /// </summary>
    /// <exception cref="Exception">The text ends before a closing quote.</exception>
    private LiteralToken ReadString(int tokenStartIndex)
    {
        // Skip the opening quote.
        Next();

        var buffer = new System.Text.StringBuilder();

        while (true)
        {
            if (!Peek().TryGetValue(out var next))
            {
                throw new Exception("Unclosed string literal");
            }

            Next();

            if (next == '"')
            {
                break;
            }

            buffer.Append(next);
        }

        return new LiteralToken(GetSourceFileSpan(tokenStartIndex), LiteralKind.String, buffer.ToString());
    }

    /// <summary>
    /// Tries to match an operator or punctuation spelling at the current position,
    /// preferring longer spellings. On success the matched characters are consumed.
    /// </summary>
    private bool TryReadSymbol(out Symbol symbol)
    {
        foreach (var (pattern, candidate) in OrderedSymbols)
        {
            if (!IsAt(pattern))
            {
                continue;
            }

            _index += pattern.Length;
            symbol = candidate;
            return true;
        }

        symbol = default;
        return false;
    }

    /// <summary>Returns whether the text at the current position starts with <paramref name="pattern"/>.</summary>
    private bool IsAt(string pattern)
    {
        for (var i = 0; i < pattern.Length; i++)
        {
            var c = Peek(i);
            if (!c.HasValue || c.Value != pattern[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Returns the character <paramref name="offset"/> positions ahead of the current index
    /// without consuming it, or an empty value when that position is outside the text.
    /// </summary>
    private Optional<char> Peek(int offset = 0)
    {
        var text = _sourceFile.GetText();
        var position = _index + offset;

        if (position >= 0 && position < text.Length)
        {
            return text[position];
        }

        return Optional<char>.Empty();
    }

    /// <summary>Advances the current index by one character.</summary>
    private void Next()
    {
        _index++;
    }

    /// <summary>
    /// Builds the span from <paramref name="tokenStartIndex"/> up to the current index.
    /// </summary>
    private SourceFileSpan GetSourceFileSpan(int tokenStartIndex)
    {
        var start = CalculateSourceLocation(tokenStartIndex);
        var end = CalculateSourceLocation(_index);
        return new SourceFileSpan(_sourceFile, new SourceSpan(start, end));
    }

    /// <summary>
    /// Converts a character offset into a 1-based line and column. A <c>'\n'</c> starts a
    /// new line; every other character advances the column. Offsets past the end resolve
    /// to the location just after the last character.
    /// </summary>
    private SourceLocation CalculateSourceLocation(int index)
    {
        // Assumes GetText() returns the same text for the lifetime of this tokenizer;
        // the resume point below relies on that.
        var text = _sourceFile.GetText();
        var limit = Math.Min(index, text.Length);

        // Offsets are normally requested in increasing order. If an earlier one is
        // requested (e.g. after Tokenize restarts), fall back to scanning from the start.
        if (limit < _locationIndex)
        {
            _locationIndex = 0;
            _locationLine = 1;
            _locationColumn = 1;
        }

        for (; _locationIndex < limit; _locationIndex++)
        {
            if (text[_locationIndex] == '\n')
            {
                _locationLine++;
                _locationColumn = 1;
            }
            else
            {
                _locationColumn++;
            }
        }

        return new SourceLocation(_locationLine, _locationColumn);
    }
}
|
||||
Reference in New Issue
Block a user