Files
nub-lang/src/Nub.Lang.Syntax/Tokenization/Tokenizer.cs
nub31 705c83c935 ...
2025-06-12 23:52:52 +02:00

279 lines
7.5 KiB
C#

using System.Text;
using Nub.Lang.Common;
using Nub.Lang.Syntax.Diagnostics;
namespace Nub.Lang.Syntax.Tokenization;
/// <summary>
/// Splits a <see cref="SourceText"/> into a flat list of tokens: keyword and punctuation
/// symbols, modifiers, identifiers, literals and <c>///</c> documentation comments.
/// </summary>
/// <remarks>
/// The cursor is kept in static fields that <see cref="Tokenize"/> resets on entry, so the
/// tokenizer is neither thread-safe nor reentrant.
/// </remarks>
public static class Tokenizer
{
    // Reserved words that produce a SymbolToken instead of an IdentifierToken.
    private static readonly Dictionary<string, Symbol> Keywords = new()
    {
        ["namespace"] = Symbol.Namespace,
        ["func"] = Symbol.Func,
        ["if"] = Symbol.If,
        ["else"] = Symbol.Else,
        ["while"] = Symbol.While,
        ["break"] = Symbol.Break,
        ["continue"] = Symbol.Continue,
        ["return"] = Symbol.Return,
        ["alloc"] = Symbol.Alloc,
        ["struct"] = Symbol.Struct,
        ["let"] = Symbol.Let,
        ["calls"] = Symbol.Calls,
    };

    // Reserved words that produce a ModifierToken.
    private static readonly Dictionary<string, Modifier> Modifiers = new()
    {
        ["export"] = Modifier.Export,
        ["extern"] = Modifier.Extern,
    };

    // Multi-character operators. They are matched before the single-character table so
    // that "==" is not split into two '=' tokens. No entry is a prefix of another, so the
    // order in which the entries are tried does not matter. Keys are strings because
    // arrays compare by reference, which makes a char[] key unusable for real lookups.
    private static readonly Dictionary<string, Symbol> Chains = new()
    {
        ["=="] = Symbol.Equal,
        ["!="] = Symbol.NotEqual,
        ["<="] = Symbol.LessThanOrEqual,
        [">="] = Symbol.GreaterThanOrEqual,
        ["::"] = Symbol.DoubleColon,
    };

    // Single-character punctuation and operators.
    private static readonly Dictionary<char, Symbol> Chars = new()
    {
        [';'] = Symbol.Semicolon,
        [':'] = Symbol.Colon,
        ['('] = Symbol.OpenParen,
        [')'] = Symbol.CloseParen,
        ['{'] = Symbol.OpenBrace,
        ['}'] = Symbol.CloseBrace,
        ['['] = Symbol.OpenBracket,
        [']'] = Symbol.CloseBracket,
        [','] = Symbol.Comma,
        ['.'] = Symbol.Period,
        ['='] = Symbol.Assign,
        ['<'] = Symbol.LessThan,
        ['>'] = Symbol.GreaterThan,
        ['+'] = Symbol.Plus,
        ['-'] = Symbol.Minus,
        ['*'] = Symbol.Star,
        ['/'] = Symbol.ForwardSlash,
        ['!'] = Symbol.Bang,
        ['^'] = Symbol.Caret,
        ['&'] = Symbol.Ampersand,
    };

    // Cursor state for the text currently being tokenized.
    private static SourceText _sourceText;
    private static int _index;

    // 1-based line/column of the character at _index. Maintained by Next() so that
    // building a span never has to rescan the source from the beginning.
    private static int _line;
    private static int _column;

    /// <summary>
    /// Tokenizes <paramref name="sourceText"/> from its first character to its end.
    /// </summary>
    /// <param name="sourceText">The source whose <c>Content</c> is scanned.</param>
    /// <returns>
    /// A result wrapping the tokens in source order. The other constructor argument is
    /// always passed as an empty collection; lexical errors are thrown, not collected.
    /// </returns>
    /// <exception cref="Exception">
    /// A number literal contains more than one period, a string literal is not closed
    /// before the end of the text, or an unrecognized character is encountered.
    /// </exception>
    public static DiagnosticsResult<List<Token>> Tokenize(SourceText sourceText)
    {
        _sourceText = sourceText;
        _index = 0;
        _line = 1;
        _column = 1;

        List<Token> tokens = [];
        while (ParseToken().TryGetValue(out var token))
        {
            tokens.Add(token);
        }

        return new DiagnosticsResult<List<Token>>([], tokens);
    }

    /// <summary>Advances the cursor past any run of whitespace characters.</summary>
    private static void ConsumeWhitespace()
    {
        while (Peek().TryGetValue(out var character) && char.IsWhiteSpace(character))
        {
            Next();
        }
    }

    /// <summary>
    /// Reads the next token, skipping whitespace and plain <c>//</c> comments.
    /// </summary>
    /// <returns>The next token, or an empty optional once the end of the text is reached.</returns>
    private static Optional<Token> ParseToken()
    {
        // Loop rather than recurse past plain comments, so that a long run of comment
        // lines cannot exhaust the call stack.
        while (true)
        {
            ConsumeWhitespace();

            if (!Peek().TryGetValue(out var current))
            {
                return Optional<Token>.Empty();
            }

            var start = CurrentLocation();

            if (current == '/' && Peek(1).TryGetValue(out var second) && second == '/')
            {
                Next();
                Next();

                if (Peek().TryGetValue(out var third) && third == '/')
                {
                    // "///" starts a documentation comment whose text runs to the end of the line.
                    Next();
                    var documentation = TakeWhile(static c => c != '\n');
                    Next(); // Consume the terminating newline; no-op at end of text.
                    return new DocumentationToken(CreateSpan(start), documentation);
                }

                SkipToEndOfLine();
                Next(); // Consume the terminating newline; no-op at end of text.
                continue;
            }

            if (char.IsLetter(current) || current == '_')
            {
                return ReadWord(start);
            }

            if (char.IsDigit(current))
            {
                return ReadNumber(start);
            }

            foreach (var chain in Chains)
            {
                if (!LookingAt(chain.Key))
                {
                    continue;
                }

                for (var i = 0; i < chain.Key.Length; i++)
                {
                    Next();
                }

                return new SymbolToken(CreateSpan(start), chain.Value);
            }

            if (Chars.TryGetValue(current, out var charSymbol))
            {
                Next();
                return new SymbolToken(CreateSpan(start), charSymbol);
            }

            if (current == '"')
            {
                return ReadString(start);
            }

            throw new Exception($"Unknown character {current}");
        }
    }

    /// <summary>
    /// Reads a run of letters, digits and underscores and classifies it as a keyword,
    /// modifier, boolean literal or identifier, in that order of precedence.
    /// </summary>
    private static Token ReadWord(SourceLocation start)
    {
        var word = TakeWhile(static c => char.IsLetterOrDigit(c) || c == '_');

        if (Keywords.TryGetValue(word, out var keywordSymbol))
        {
            return new SymbolToken(CreateSpan(start), keywordSymbol);
        }

        if (Modifiers.TryGetValue(word, out var modifier))
        {
            return new ModifierToken(CreateSpan(start), modifier);
        }

        if (word is "true" or "false")
        {
            return new LiteralToken(CreateSpan(start), LiteralKind.Bool, word);
        }

        return new IdentifierToken(CreateSpan(start), word);
    }

    /// <summary>
    /// Reads an integer or float literal: a run of digits containing at most one period.
    /// </summary>
    /// <exception cref="Exception">A second period follows the literal's digits.</exception>
    private static Token ReadNumber(SourceLocation start)
    {
        var text = new StringBuilder();
        var isFloat = false;

        while (Peek().TryGetValue(out var next))
        {
            if (next == '.')
            {
                if (isFloat)
                {
                    throw new Exception("More than one period found in float literal");
                }

                isFloat = true;
            }
            else if (!char.IsDigit(next))
            {
                break;
            }

            text.Append(next);
            Next();
        }

        var kind = isFloat ? LiteralKind.Float : LiteralKind.Integer;
        return new LiteralToken(CreateSpan(start), kind, text.ToString());
    }

    /// <summary>
    /// Reads a double-quoted string literal. The characters between the quotes are taken
    /// verbatim; no escape sequences are interpreted.
    /// </summary>
    /// <exception cref="Exception">The text ends before a closing quote is found.</exception>
    private static Token ReadString(SourceLocation start)
    {
        Next(); // Opening quote.

        var content = new StringBuilder();
        while (true)
        {
            if (!Peek().TryGetValue(out var next))
            {
                throw new Exception("Unclosed string literal");
            }

            Next();
            if (next == '"')
            {
                break;
            }

            content.Append(next);
        }

        return new LiteralToken(CreateSpan(start), LiteralKind.String, content.ToString());
    }

    /// <summary>
    /// Consumes characters while <paramref name="predicate"/> accepts them and returns
    /// the consumed text.
    /// </summary>
    private static string TakeWhile(Func<char, bool> predicate)
    {
        var text = new StringBuilder();
        while (Peek().TryGetValue(out var character) && predicate(character))
        {
            text.Append(character);
            Next();
        }

        return text.ToString();
    }

    /// <summary>Advances the cursor to the next newline (not consumed) or the end of the text.</summary>
    private static void SkipToEndOfLine()
    {
        while (Peek().TryGetValue(out var character) && character != '\n')
        {
            Next();
        }
    }

    /// <summary>Returns whether the unread text starts with <paramref name="text"/>.</summary>
    private static bool LookingAt(string text)
    {
        for (var i = 0; i < text.Length; i++)
        {
            if (!Peek(i).TryGetValue(out var character) || character != text[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>Returns the 1-based line and column of the character at the cursor.</summary>
    private static SourceLocation CurrentLocation()
    {
        return new SourceLocation(_line, _column);
    }

    /// <summary>
    /// Builds a span from <paramref name="start"/> to the current cursor position, i.e.
    /// the end location is the position just after the last consumed character.
    /// </summary>
    private static SourceSpan CreateSpan(SourceLocation start)
    {
        return new SourceSpan(_sourceText, start, CurrentLocation());
    }

    /// <summary>
    /// Returns the character <paramref name="offset"/> positions ahead of the cursor
    /// without consuming it, or an empty optional when that position is past the end.
    /// </summary>
    private static Optional<char> Peek(int offset = 0)
    {
        if (_index + offset < _sourceText.Content.Length)
        {
            return _sourceText.Content[_index + offset];
        }

        return Optional<char>.Empty();
    }

    /// <summary>
    /// Consumes one character and updates the tracked line and column. Does nothing at
    /// the end of the text, so the cursor never moves past the content.
    /// </summary>
    private static void Next()
    {
        if (!Peek().TryGetValue(out var consumed))
        {
            return;
        }

        if (consumed == '\n')
        {
            _line += 1;
            _column = 1;
        }
        else
        {
            _column += 1;
        }

        _index++;
    }
}