...
This commit is contained in:
@@ -4,72 +4,9 @@ namespace NubLang.Syntax;
|
||||
|
||||
public sealed class Tokenizer
|
||||
{
|
||||
/// <summary>
/// Reserved words of the language, mapped to the <see cref="Symbol"/> each one produces.
/// </summary>
private static readonly Dictionary<string, Symbol> Keywords = new()
{
    { "func", Symbol.Func },
    { "if", Symbol.If },
    { "else", Symbol.Else },
    { "while", Symbol.While },
    { "for", Symbol.For },
    { "in", Symbol.In },
    { "break", Symbol.Break },
    { "continue", Symbol.Continue },
    { "return", Symbol.Return },
    { "struct", Symbol.Struct },
    { "let", Symbol.Let },
    { "extern", Symbol.Extern },
    { "module", Symbol.Module },
    { "export", Symbol.Export },
    { "import", Symbol.Import },
    { "defer", Symbol.Defer },
    { "enum", Symbol.Enum },
};
|
||||
|
||||
/// <summary>
/// Punctuation and operator spellings, mapped to the <see cref="Symbol"/> each one produces.
/// </summary>
/// <remarks>
/// The keys are arrays, which compare by reference, so this table is only useful when
/// enumerated (as <see cref="OrderedSymbols"/> does); looking a key up by content will not work.
/// </remarks>
private static readonly Dictionary<char[], Symbol> Symbols = new()
{
    // Two-character operators
    { "==".ToCharArray(), Symbol.Equal },
    { "!=".ToCharArray(), Symbol.NotEqual },
    { "<=".ToCharArray(), Symbol.LessThanOrEqual },
    { ">=".ToCharArray(), Symbol.GreaterThanOrEqual },
    { "<<".ToCharArray(), Symbol.LeftShift },
    { ">>".ToCharArray(), Symbol.RightShift },
    { "&&".ToCharArray(), Symbol.And },
    { "||".ToCharArray(), Symbol.Or },
    { "::".ToCharArray(), Symbol.DoubleColon },

    // Single-character symbols
    { ":".ToCharArray(), Symbol.Colon },
    { "(".ToCharArray(), Symbol.OpenParen },
    { ")".ToCharArray(), Symbol.CloseParen },
    { "{".ToCharArray(), Symbol.OpenBrace },
    { "}".ToCharArray(), Symbol.CloseBrace },
    { "[".ToCharArray(), Symbol.OpenBracket },
    { "]".ToCharArray(), Symbol.CloseBracket },
    { ",".ToCharArray(), Symbol.Comma },
    { ".".ToCharArray(), Symbol.Period },
    { "=".ToCharArray(), Symbol.Assign },
    { "<".ToCharArray(), Symbol.LessThan },
    { ">".ToCharArray(), Symbol.GreaterThan },
    { "+".ToCharArray(), Symbol.Plus },
    { "-".ToCharArray(), Symbol.Minus },
    { "*".ToCharArray(), Symbol.Star },
    { "/".ToCharArray(), Symbol.ForwardSlash },
    { "!".ToCharArray(), Symbol.Bang },
    { "^".ToCharArray(), Symbol.Caret },
    { "&".ToCharArray(), Symbol.Ampersand },
    { ";".ToCharArray(), Symbol.Semi },
    { "%".ToCharArray(), Symbol.Percent },
    { "|".ToCharArray(), Symbol.Pipe },
    { "@".ToCharArray(), Symbol.At },
    { "?".ToCharArray(), Symbol.QuestionMark },
};

/// <summary>
/// The entries of <see cref="Symbols"/> sorted so that longer patterns come first,
/// letting a scan that takes the first hit prefer "==" over "=".
/// </summary>
private static readonly (char[] Pattern, Symbol Symbol)[] OrderedSymbols =
    (from entry in Symbols
     orderby entry.Key.Length descending
     select (entry.Key, entry.Value))
    .ToArray();
|
||||
|
||||
// Name of the file being tokenized; attached to diagnostics and source spans.
private readonly string _fileName;

// Full text being tokenized.
private readonly string _content;

// Offset into _content of the next unread character.
// Declared exactly once: a second declaration of the same field does not compile.
private int _index;

// Line and column of the character at _index; both start at 1.
private int _line = 1;
private int _column = 1;
|
||||
|
||||
@@ -79,8 +16,8 @@ public sealed class Tokenizer
|
||||
_content = content;
|
||||
}
|
||||
|
||||
/// <summary>
/// Diagnostics gathered by this tokenizer. The list starts with room for 16 entries.
/// </summary>
public List<Diagnostic> Diagnostics { get; } = new(16);

/// <summary>
/// Tokens gathered by this tokenizer (presumably filled by <see cref="Tokenize"/> — confirm
/// against its body). The list starts with room for 256 entries to limit early regrowth.
/// </summary>
public List<Token> Tokens { get; } = new(256);
|
||||
|
||||
public void Tokenize()
|
||||
{
|
||||
@@ -90,17 +27,17 @@ public sealed class Tokenizer
|
||||
_line = 1;
|
||||
_column = 1;
|
||||
|
||||
while (Peek().HasValue)
|
||||
while (_index < _content.Length)
|
||||
{
|
||||
try
|
||||
{
|
||||
var current = Peek()!.Value;
|
||||
var current = _content[_index];
|
||||
|
||||
if (char.IsWhiteSpace(current))
|
||||
{
|
||||
if (current is '\n')
|
||||
if (current == '\n')
|
||||
{
|
||||
_line += 1;
|
||||
// note(nub31): Next increments the column, so 0 is correct here
|
||||
_column = 0;
|
||||
}
|
||||
|
||||
@@ -108,10 +45,10 @@ public sealed class Tokenizer
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current == '/' && Peek(1) == '/')
|
||||
if (current == '/' && _index + 1 < _content.Length && _content[_index + 1] == '/')
|
||||
{
|
||||
// note(nub31): Keep newline so next iteration increments the line counter
|
||||
while (Peek() is not '\n')
|
||||
Next(2);
|
||||
while (_index < _content.Length && _content[_index] != '\n')
|
||||
{
|
||||
Next();
|
||||
}
|
||||
@@ -131,192 +68,312 @@ public sealed class Tokenizer
|
||||
|
||||
/// <summary>
/// Reads the single token that begins at the current position and dispatches to the
/// matching specialised parser.
/// </summary>
/// <param name="current">The character at the current position; selects the token category.</param>
/// <param name="lineStart">Line recorded as the start of the token's span.</param>
/// <param name="columnStart">Column recorded as the start of the token's span.</param>
/// <returns>The parsed token.</returns>
/// <exception cref="TokenizerException">
/// Thrown when no token category matches <paramref name="current"/>, or when one of the
/// specialised parsers rejects the input.
/// </exception>
private Token ParseToken(char current, int lineStart, int columnStart)
{
    // Longest spelling handled by TryMatchSymbol ("continue").
    const int longestSymbolLength = 8;

    // Numbers
    if (char.IsDigit(current))
    {
        return ParseNumber(lineStart, columnStart);
    }

    // String literals
    if (current == '"')
    {
        return ParseString(lineStart, columnStart);
    }

    // Keywords and symbols, longest candidate first so that "==" wins over "=".
    for (var length = longestSymbolLength; length >= 1; length--)
    {
        if (TryMatchSymbol(length, lineStart, columnStart, out var token))
        {
            return token;
        }
    }

    // Identifiers
    // NOTE(review): no branch in this method yields a BoolLiteralToken, so "true"/"false"
    // reach ParseIdentifier like any other name — confirm they are recognised downstream.
    if (char.IsLetter(current) || current == '_')
    {
        return ParseIdentifier(lineStart, columnStart);
    }

    // Nothing has been consumed yet, so _line/_column still point at the offending character.
    throw new TokenizerException(Diagnostic
        .Error($"Unknown token '{current}'")
        .At(_fileName, _line, _column)
        .Build());
}
|
||||
|
||||
/// <summary>
/// Reads a numeric literal starting at the current position: a <c>0x</c> hex integer,
/// a <c>0b</c> binary integer, a decimal integer, or a float containing one period.
/// </summary>
/// <param name="lineStart">Line recorded as the start of the token's span.</param>
/// <param name="columnStart">Column recorded as the start of the token's span.</param>
/// <returns>An <c>IntLiteralToken</c> (base 16, 2 or 10) or a <c>FloatLiteralToken</c>.</returns>
/// <exception cref="TokenizerException">
/// Thrown when a prefix is not followed by any digit, or a second period is encountered.
/// </exception>
private Token ParseNumber(int lineStart, int columnStart)
{
    var literalStart = _index;

    // A leading '0' followed by 'x' or 'b' selects a prefixed integer literal.
    if (_content[_index] == '0' && _index + 1 < _content.Length)
    {
        switch (_content[_index + 1])
        {
            case 'x':
                return ParsePrefixedInt(literalStart, lineStart, columnStart, 16);
            case 'b':
                return ParsePrefixedInt(literalStart, lineStart, columnStart, 2);
        }
    }

    // Decimal or float: consume digits and at most one period.
    var sawPeriod = false;
    for (; _index < _content.Length; Next())
    {
        var ch = _content[_index];

        if (ch == '.')
        {
            if (sawPeriod)
            {
                throw new TokenizerException(Diagnostic
                    .Error("More than one period found in float literal")
                    .At(_fileName, _line, _column)
                    .Build());
            }

            sawPeriod = true;
        }
        else if (!char.IsDigit(ch))
        {
            break;
        }
    }

    var span = CreateSpan(lineStart, columnStart);
    var text = _content[literalStart.._index];

    if (sawPeriod)
    {
        return new FloatLiteralToken(span, text);
    }

    return new IntLiteralToken(span, text, 10);
}

// Reads a "0x…" (radix 16) or "0b…" (radix 2) literal whose prefix starts at literalStart.
// The token text keeps the prefix.
private IntLiteralToken ParsePrefixedInt(int literalStart, int lineStart, int columnStart, int radix)
{
    // Skip the two-character prefix.
    Next(2);
    var firstDigit = _index;

    while (_index < _content.Length)
    {
        var ch = _content[_index];
        var isDigit = radix == 16 ? Uri.IsHexDigit(ch) : ch is '0' or '1';
        if (!isDigit)
        {
            break;
        }

        Next();
    }

    if (_index == firstDigit)
    {
        throw new TokenizerException(Diagnostic
            .Error(radix == 16
                ? "Invalid hex literal, no digits found"
                : "Invalid binary literal, no digits found")
            .At(_fileName, _line, _column)
            .Build());
    }

    return new IntLiteralToken(
        CreateSpan(lineStart, columnStart),
        _content[literalStart.._index],
        radix);
}
|
||||
|
||||
/// <summary>
/// Reads a double-quoted string literal. The current position must be on the opening quote.
/// </summary>
/// <param name="lineStart">Line recorded as the start of the token's span.</param>
/// <param name="columnStart">Column recorded as the start of the token's span.</param>
/// <returns>A token whose text is everything between the two quotes, taken verbatim.</returns>
/// <exception cref="TokenizerException">
/// Thrown when a newline or the end of the input is reached before the closing quote.
/// </exception>
private StringLiteralToken ParseString(int lineStart, int columnStart)
{
    // Step over the opening quote; the literal's text starts right after it.
    Next();
    var textStart = _index;

    for (; _index < _content.Length; Next())
    {
        switch (_content[_index])
        {
            case '\n':
            {
                throw new TokenizerException(Diagnostic
                    .Error("Unclosed string literal (newline found)")
                    .At(_fileName, _line, _column)
                    .Build());
            }
            case '"':
            {
                // Capture the text before consuming the closing quote.
                var text = _content[textStart.._index];
                Next();
                return new StringLiteralToken(CreateSpan(lineStart, columnStart), text);
            }
        }
    }

    // Ran off the end of the input without seeing a closing quote.
    throw new TokenizerException(Diagnostic
        .Error("Unclosed string literal")
        .At(_fileName, _line, _column)
        .Build());
}
|
||||
|
||||
/// <summary>
/// Tries to read a keyword or symbol that is exactly <paramref name="length"/> characters
/// long at the current position. The position only advances on success.
/// </summary>
/// <param name="length">Number of characters to compare against the known spellings.</param>
/// <param name="lineStart">Line recorded as the start of the token's span.</param>
/// <param name="columnStart">Column recorded as the start of the token's span.</param>
/// <param name="token">The resulting <c>SymbolToken</c> on success; <c>null</c> otherwise.</param>
/// <returns><c>true</c> when a keyword or symbol was matched and consumed.</returns>
private bool TryMatchSymbol(
    int length,
    int lineStart,
    int columnStart,
    [System.Diagnostics.CodeAnalysis.NotNullWhen(true)] out Token? token)
{
    token = null;

    // Not enough input left for a spelling of this length.
    if (_index + length > _content.Length)
    {
        return false;
    }

    var span = _content.AsSpan(_index, length);

    // A string pattern only matches a span of the same length, so one flat table is
    // equivalent to dispatching on the length first.
    var symbol = span switch
    {
        // Keywords
        "continue" => Symbol.Continue,
        "return" => Symbol.Return,
        "struct" => Symbol.Struct,
        "extern" => Symbol.Extern,
        "module" => Symbol.Module,
        "export" => Symbol.Export,
        "import" => Symbol.Import,
        "break" => Symbol.Break,
        "while" => Symbol.While,
        "defer" => Symbol.Defer,
        "func" => Symbol.Func,
        "else" => Symbol.Else,
        "enum" => Symbol.Enum,
        "for" => Symbol.For,
        "let" => Symbol.Let,
        "if" => Symbol.If,
        "in" => Symbol.In,

        // Two-character operators
        "==" => Symbol.Equal,
        "!=" => Symbol.NotEqual,
        "<=" => Symbol.LessThanOrEqual,
        ">=" => Symbol.GreaterThanOrEqual,
        "<<" => Symbol.LeftShift,
        ">>" => Symbol.RightShift,
        "&&" => Symbol.And,
        "||" => Symbol.Or,
        "::" => Symbol.DoubleColon,

        // Single-character symbols
        ":" => Symbol.Colon,
        "(" => Symbol.OpenParen,
        ")" => Symbol.CloseParen,
        "{" => Symbol.OpenBrace,
        "}" => Symbol.CloseBrace,
        "[" => Symbol.OpenBracket,
        "]" => Symbol.CloseBracket,
        "," => Symbol.Comma,
        "." => Symbol.Period,
        "=" => Symbol.Assign,
        "<" => Symbol.LessThan,
        ">" => Symbol.GreaterThan,
        "+" => Symbol.Plus,
        "-" => Symbol.Minus,
        "*" => Symbol.Star,
        "/" => Symbol.ForwardSlash,
        "!" => Symbol.Bang,
        "^" => Symbol.Caret,
        "&" => Symbol.Ampersand,
        ";" => Symbol.Semi,
        "%" => Symbol.Percent,
        "|" => Symbol.Pipe,
        "@" => Symbol.At,
        "?" => Symbol.QuestionMark,

        _ => Symbol.None,
    };

    if (symbol == Symbol.None)
    {
        return false;
    }

    // A keyword must not be the prefix of a longer identifier ("iffy", "in_range"):
    // reject the match when the following character could continue a name.
    if (char.IsLetter(span[0]))
    {
        var following = _index + length;
        if (following < _content.Length)
        {
            var nextChar = _content[following];
            if (char.IsLetterOrDigit(nextChar) || nextChar == '_')
            {
                return false;
            }
        }
    }

    Next(length);
    token = new SymbolToken(CreateSpan(lineStart, columnStart), symbol);
    return true;
}
|
||||
|
||||
/// <summary>
/// Reads an identifier: the run of letters, digits and underscores that starts at the
/// current position.
/// </summary>
/// <param name="lineStart">Line recorded as the start of the token's span.</param>
/// <param name="columnStart">Column recorded as the start of the token's span.</param>
/// <returns>A token carrying the identifier's text.</returns>
private IdentifierToken ParseIdentifier(int lineStart, int columnStart)
{
    var nameStart = _index;

    // Advance while the current character can still belong to a name.
    while (_index < _content.Length
           && (char.IsLetterOrDigit(_content[_index]) || _content[_index] == '_'))
    {
        Next();
    }

    var span = CreateSpan(lineStart, columnStart);
    var name = _content[nameStart.._index];
    return new IdentifierToken(span, name);
}
|
||||
|
||||
/// <summary>
/// Builds a span in the current file running from the given start location to the
/// tokenizer's current line and column.
/// </summary>
/// <param name="lineStart">Line of the span's start.</param>
/// <param name="columnStart">Column of the span's start.</param>
/// <returns>The span from the start location to the current position.</returns>
private SourceSpan CreateSpan(int lineStart, int columnStart)
{
    var begin = new SourceLocation(lineStart, columnStart);
    var end = new SourceLocation(_line, _column);
    return new SourceSpan(_fileName, begin, end);
}
|
||||
|
||||
/// <summary>
/// Returns the character <paramref name="offset"/> positions ahead of the current one
/// without consuming it.
/// </summary>
/// <param name="offset">Distance from the current position; 0 is the current character.</param>
/// <returns>The character at that position, or <c>null</c> when it lies past the end of the input.</returns>
private char? Peek(int offset = 0)
{
    if (_index + offset < _content.Length)
    {
        return _content[_index + offset];
    }

    return null;
}

/// <summary>
/// Consumes <paramref name="count"/> characters by advancing both the index and the column.
/// </summary>
/// <param name="count">How many characters to consume; defaults to one.</param>
/// <remarks>
/// The line counter is not touched here; code that consumes a newline adjusts
/// <c>_line</c> and <c>_column</c> itself.
/// </remarks>
private void Next(int count = 1)
{
    _index += count;
    _column += count;
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user