327 lines
9.2 KiB
C#
327 lines
9.2 KiB
C#
using NubLang.Diagnostics;
|
|
|
|
namespace NubLang.Syntax;
|
|
|
|
/// <summary>
/// Splits the text of one source file into <see cref="Token"/>s.
/// Input that cannot be tokenized is reported through <see cref="Diagnostics"/>
/// instead of aborting the scan.
/// </summary>
public sealed class Tokenizer
{
    /// <summary>Reserved words and the symbol each one produces.</summary>
    private static readonly Dictionary<string, Symbol> Keywords = new()
    {
        ["func"] = Symbol.Func,
        ["if"] = Symbol.If,
        ["else"] = Symbol.Else,
        ["while"] = Symbol.While,
        ["break"] = Symbol.Break,
        ["continue"] = Symbol.Continue,
        ["return"] = Symbol.Return,
        ["struct"] = Symbol.Struct,
        ["let"] = Symbol.Let,
        ["extern"] = Symbol.Extern,
        ["module"] = Symbol.Module,
        ["export"] = Symbol.Export,
        ["import"] = Symbol.Import,
        ["defer"] = Symbol.Defer,
    };

    /// <summary>
    /// Punctuation and operator patterns, sorted longest-first so that a
    /// two-character operator such as "==" is tried before its one-character
    /// prefix "=". The sort is stable, so patterns of equal length keep the
    /// order in which they are listed here.
    /// </summary>
    private static readonly (string Pattern, Symbol Symbol)[] OrderedSymbols =
        new (string Pattern, Symbol Symbol)[]
        {
            ("==", Symbol.Equal),
            ("!=", Symbol.NotEqual),
            ("<=", Symbol.LessThanOrEqual),
            (">=", Symbol.GreaterThanOrEqual),
            ("<<", Symbol.LeftShift),
            (">>", Symbol.RightShift),
            ("&&", Symbol.And),
            ("||", Symbol.Or),
            ("::", Symbol.DoubleColon),
            (":", Symbol.Colon),
            ("(", Symbol.OpenParen),
            (")", Symbol.CloseParen),
            ("{", Symbol.OpenBrace),
            ("}", Symbol.CloseBrace),
            ("[", Symbol.OpenBracket),
            ("]", Symbol.CloseBracket),
            (",", Symbol.Comma),
            (".", Symbol.Period),
            ("=", Symbol.Assign),
            ("<", Symbol.LessThan),
            (">", Symbol.GreaterThan),
            ("+", Symbol.Plus),
            ("-", Symbol.Minus),
            ("*", Symbol.Star),
            ("/", Symbol.ForwardSlash),
            ("!", Symbol.Bang),
            ("^", Symbol.Caret),
            ("&", Symbol.Ampersand),
            (";", Symbol.Semi),
            ("%", Symbol.Percent),
            ("|", Symbol.Pipe),
            ("@", Symbol.At),
        }
        .OrderByDescending(entry => entry.Pattern.Length)
        .ToArray();

    private readonly string _fileName;
    private readonly string _content;
    private int _index;
    private int _line = 1;
    private int _column = 1;

    /// <summary>Creates a tokenizer over <paramref name="content"/>.</summary>
    /// <param name="fileName">File name recorded in spans and diagnostics.</param>
    /// <param name="content">Full source text to tokenize.</param>
    public Tokenizer(string fileName, string content)
    {
        _fileName = fileName;
        _content = content;
    }

    /// <summary>Diagnostics collected by the most recent <see cref="Tokenize"/> call.</summary>
    public List<Diagnostic> Diagnostics { get; } = [];

    /// <summary>Tokens produced by the most recent <see cref="Tokenize"/> call.</summary>
    public List<Token> Tokens { get; } = [];

    /// <summary>
    /// Scans the whole content from the start, refilling <see cref="Tokens"/>
    /// and <see cref="Diagnostics"/>. Whitespace and <c>//</c> line comments
    /// are skipped. A <see cref="TokenizerException"/> raised while reading a
    /// token is recorded as a diagnostic and scanning continues.
    /// </summary>
    public void Tokenize()
    {
        Diagnostics.Clear();
        Tokens.Clear();
        _index = 0;
        _line = 1;
        _column = 1;

        while (Peek() is { } current)
        {
            var startIndex = _index;

            try
            {
                if (char.IsWhiteSpace(current))
                {
                    // Advance first, then reset the column: resetting before
                    // Next() would leave every later line starting at column 2.
                    Next();
                    if (current == '\n')
                    {
                        _line += 1;
                        _column = 1;
                    }

                    continue;
                }

                if (current == '/' && Peek(1) == '/')
                {
                    // Stop at the newline (left for the whitespace branch so the
                    // line counter is updated) or at end of input, so a comment
                    // on the last line cannot loop forever.
                    while (Peek() is { } commentChar && commentChar != '\n')
                    {
                        Next();
                    }

                    continue;
                }

                Tokens.Add(ParseToken(current, _line, _column));
            }
            catch (TokenizerException e)
            {
                Diagnostics.Add(e.Diagnostic);

                // Only force progress when the failing rule consumed nothing.
                // Skipping unconditionally could swallow an uncounted newline
                // (e.g. after "0x" at the end of a line).
                if (_index == startIndex)
                {
                    Next();
                }
            }
        }
    }

    /// <summary>
    /// Reads the token that begins with <paramref name="current"/> at the current position.
    /// </summary>
    /// <param name="current">Character at the current position.</param>
    /// <param name="lineStart">Line on which the token starts.</param>
    /// <param name="columnStart">Column at which the token starts.</param>
    /// <exception cref="TokenizerException">The input does not form a valid token.</exception>
    private Token ParseToken(char current, int lineStart, int columnStart)
    {
        if (char.IsLetter(current) || current == '_')
        {
            return ParseWord(lineStart, columnStart);
        }

        if (char.IsDigit(current))
        {
            return ParseNumber(current, lineStart, columnStart);
        }

        if (current == '"')
        {
            return ParseString(lineStart, columnStart);
        }

        return ParseSymbol(current, lineStart, columnStart);
    }

    /// <summary>Reads a keyword, a <c>true</c>/<c>false</c> literal, or an identifier.</summary>
    private Token ParseWord(int lineStart, int columnStart)
    {
        var start = _index;
        while (Peek() is { } c && (char.IsLetterOrDigit(c) || c == '_'))
        {
            Next();
        }

        var text = _content[start.._index];

        if (Keywords.TryGetValue(text, out var keywordSymbol))
        {
            return new SymbolToken(CreateSpan(lineStart, columnStart), keywordSymbol);
        }

        if (text is "true" or "false")
        {
            return new BoolLiteralToken(CreateSpan(lineStart, columnStart), text is "true");
        }

        return new IdentifierToken(CreateSpan(lineStart, columnStart), text);
    }

    /// <summary>
    /// Reads a hexadecimal (<c>0x</c>), binary (<c>0b</c>), decimal integer, or float literal.
    /// </summary>
    private Token ParseNumber(char current, int lineStart, int columnStart)
    {
        if (current == '0' && Peek(1) is 'x')
        {
            var hex = ReadPrefixedDigits(Uri.IsHexDigit, "Invalid hex literal, no digits found");
            return new IntLiteralToken(CreateSpan(lineStart, columnStart), hex, 16);
        }

        if (current == '0' && Peek(1) is 'b')
        {
            var binary = ReadPrefixedDigits(c => c is '0' or '1', "Invalid binary literal, no digits found");
            return new IntLiteralToken(CreateSpan(lineStart, columnStart), binary, 2);
        }

        var start = _index;
        var isFloat = false;

        while (Peek() is { } next)
        {
            if (next == '.')
            {
                if (isFloat)
                {
                    throw new TokenizerException(Diagnostic
                        .Error("More than one period found in float literal")
                        .At(_fileName, _line, _column)
                        .Build());
                }

                isFloat = true;
            }
            else if (!char.IsDigit(next))
            {
                break;
            }

            Next();
        }

        var text = _content[start.._index];

        if (isFloat)
        {
            return new FloatLiteralToken(CreateSpan(lineStart, columnStart), text);
        }

        return new IntLiteralToken(CreateSpan(lineStart, columnStart), text, 10);
    }

    /// <summary>
    /// Consumes a two-character radix prefix followed by every character
    /// accepted by <paramref name="isDigit"/>, and returns the literal text
    /// including the prefix.
    /// </summary>
    /// <exception cref="TokenizerException">No digit follows the prefix.</exception>
    private string ReadPrefixedDigits(Func<char, bool> isDigit, string noDigitsMessage)
    {
        var start = _index;

        // Skip the "0x" / "0b" prefix that the caller has already verified.
        Next();
        Next();

        while (Peek() is { } c && isDigit(c))
        {
            Next();
        }

        if (_index - start <= 2)
        {
            throw new TokenizerException(Diagnostic
                .Error(noDigitsMessage)
                .At(_fileName, _line, _column)
                .Build());
        }

        return _content[start.._index];
    }

    /// <summary>
    /// Reads a double-quoted string literal. The literal ends at the closing
    /// quote or, if none is found first, at the end of the line.
    /// </summary>
    /// <exception cref="TokenizerException">End of input is reached before the literal ends.</exception>
    private Token ParseString(int lineStart, int columnStart)
    {
        // Skip the opening quote.
        Next();
        var contentStart = _index;

        while (Peek() is { } next && next != '"' && next != '\n')
        {
            Next();
        }

        if (Peek() is null)
        {
            throw new TokenizerException(Diagnostic
                .Error("Unclosed string literal")
                .At(_fileName, _line, _column)
                .Build());
        }

        var text = _content[contentStart.._index];

        // Consume the closing quote. A terminating newline is deliberately left
        // in place: Tokenize() counts it, so counting it here as well would
        // advance the line number twice.
        if (Peek() is '"')
        {
            Next();
        }

        return new StringLiteralToken(CreateSpan(lineStart, columnStart), text);
    }

    /// <summary>Reads the longest operator or punctuation pattern at the current position.</summary>
    /// <exception cref="TokenizerException">No pattern matches.</exception>
    private Token ParseSymbol(char current, int lineStart, int columnStart)
    {
        foreach (var (pattern, symbol) in OrderedSymbols)
        {
            if (string.CompareOrdinal(_content, _index, pattern, 0, pattern.Length) != 0)
            {
                continue;
            }

            for (var i = 0; i < pattern.Length; i++)
            {
                Next();
            }

            return new SymbolToken(CreateSpan(lineStart, columnStart), symbol);
        }

        // Attach the location like every other diagnostic raised by this class.
        throw new TokenizerException(Diagnostic
            .Error($"Unknown token '{current}'")
            .At(_fileName, lineStart, columnStart)
            .Build());
    }

    /// <summary>
    /// Builds a span from the given start location to the current line and column.
    /// </summary>
    private SourceSpan CreateSpan(int lineStart, int columnStart)
    {
        return new SourceSpan(_fileName, new SourceLocation(lineStart, columnStart), new SourceLocation(_line, _column));
    }

    /// <summary>
    /// Returns the character <paramref name="offset"/> positions ahead of the
    /// current index, or <c>null</c> when that position is past the end of the content.
    /// </summary>
    private char? Peek(int offset = 0)
    {
        var position = _index + offset;
        if (position < _content.Length)
        {
            return _content[position];
        }

        return null;
    }

    /// <summary>Advances one character, moving the column forward with it.</summary>
    private void Next()
    {
        _index += 1;
        _column += 1;
    }
}
|
|
|
|
/// <summary>
/// Exception that carries a <see cref="Diagnostics.Diagnostic"/>; its
/// <see cref="Exception.Message"/> is the diagnostic's message.
/// </summary>
/// <param name="diagnostic">Diagnostic to carry; its message becomes the exception message.</param>
public class TokenizerException(Diagnostic diagnostic) : Exception(diagnostic.Message)
{
    /// <summary>The diagnostic supplied when the exception was created.</summary>
    public Diagnostic Diagnostic { get; } = diagnostic;
}