Files
nub-lang/compiler/NubLang/Syntax/Tokenizer.cs
nub31 560e6428ff ...
2025-10-26 22:28:48 +01:00

378 lines
10 KiB
C#

using NubLang.Diagnostics;
namespace NubLang.Syntax;
/// <summary>
/// Splits the text of one source file into a flat list of tokens.
/// </summary>
/// <remarks>
/// A lexing error does not abort the scan: the error is stored in <see cref="Diagnostics"/>,
/// one character is skipped, and scanning continues from there.
/// </remarks>
public sealed class Tokenizer
{
    // Length of the longest entry in the keyword/symbol table ("continue").
    private const int MaxSymbolLength = 8;

    private readonly string _fileName;
    private readonly string _content;

    // Offset of the next unread character in _content.
    private int _index;

    // 1-based position of the character at _index.
    private int _line = 1;
    private int _column = 1;

    /// <summary>
    /// Creates a tokenizer over <paramref name="content"/>.
    /// </summary>
    /// <param name="fileName">File name stamped on every span and diagnostic location.</param>
    /// <param name="content">Full source text to scan.</param>
    public Tokenizer(string fileName, string content)
    {
        _fileName = fileName;
        _content = content;
    }

    /// <summary>Errors collected by the most recent <see cref="Tokenize"/> call.</summary>
    public List<Diagnostic> Diagnostics { get; } = new(16);

    /// <summary>Tokens produced by the most recent <see cref="Tokenize"/> call, in source order.</summary>
    public List<Token> Tokens { get; } = new(256);

    /// <summary>
    /// Scans the whole content from the start, refilling <see cref="Tokens"/> and
    /// <see cref="Diagnostics"/>. Whitespace and <c>//</c> line comments produce no tokens.
    /// </summary>
    public void Tokenize()
    {
        Diagnostics.Clear();
        Tokens.Clear();
        _index = 0;
        _line = 1;
        _column = 1;

        while (_index < _content.Length)
        {
            try
            {
                var current = _content[_index];

                if (char.IsWhiteSpace(current))
                {
                    SkipCharacter();
                    continue;
                }

                if (current == '/' && _index + 1 < _content.Length && _content[_index + 1] == '/')
                {
                    // Line comment: consume up to, but not including, the newline so the
                    // whitespace branch above does the line accounting.
                    Next(2);
                    while (_index < _content.Length && _content[_index] != '\n')
                    {
                        Next();
                    }

                    continue;
                }

                Tokens.Add(ParseToken(current, _line, _column));
            }
            catch (CompileException e)
            {
                Diagnostics.Add(e.Diagnostic);

                // Skip one character so the loop always makes progress. Several errors are
                // raised while standing on a '\n' (e.g. an unclosed string), so the skip must
                // be newline-aware or every later position would be reported one line short.
                SkipCharacter();
            }
        }
    }

    /// <summary>
    /// Reads the single token that starts at the current position.
    /// </summary>
    /// <param name="current">The character at the current position.</param>
    /// <param name="lineStart">Line of the token's first character.</param>
    /// <param name="columnStart">Column of the token's first character.</param>
    /// <exception cref="CompileException">No token can start with <paramref name="current"/>.</exception>
    private Token ParseToken(char current, int lineStart, int columnStart)
    {
        // Numbers
        if (char.IsDigit(current))
        {
            return ParseNumber(lineStart, columnStart);
        }

        // String literals
        if (current == '"')
        {
            return ParseString(lineStart, columnStart);
        }

        // Try keywords and symbols by length (longest first) so that e.g. "==" wins over "=".
        for (var length = MaxSymbolLength; length >= 1; length--)
        {
            if (TryMatchSymbol(length, lineStart, columnStart, out var token))
            {
                return token;
            }
        }

        // Identifiers
        if (char.IsLetter(current) || current == '_')
        {
            return ParseIdentifier(lineStart, columnStart);
        }

        throw new CompileException(Diagnostic
            .Error($"Unknown token '{current}'")
            .At(_fileName, lineStart, columnStart)
            .Build());
    }

    /// <summary>
    /// Reads a numeric literal: <c>0x…</c> (base 16), <c>0b…</c> (base 2), a decimal integer,
    /// or a float containing exactly one period.
    /// </summary>
    /// <exception cref="CompileException">
    /// A <c>0x</c>/<c>0b</c> prefix has no digits after it, or a second period is found.
    /// </exception>
    private Token ParseNumber(int lineStart, int columnStart)
    {
        var start = _index;
        var hasPrefix = _content[_index] == '0' && _index + 1 < _content.Length;

        // Hex literal
        if (hasPrefix && _content[_index + 1] == 'x')
        {
            return ParsePrefixedInt(start, lineStart, columnStart, 16, Uri.IsHexDigit, "Invalid hex literal, no digits found");
        }

        // Binary literal
        if (hasPrefix && _content[_index + 1] == 'b')
        {
            return ParsePrefixedInt(start, lineStart, columnStart, 2, static c => c is '0' or '1', "Invalid binary literal, no digits found");
        }

        // Decimal or float
        var isFloat = false;
        while (_index < _content.Length)
        {
            var next = _content[_index];
            if (next == '.')
            {
                if (isFloat)
                {
                    throw new CompileException(Diagnostic
                        .Error("More than one period found in float literal")
                        .At(_fileName, _line, _column)
                        .Build());
                }

                isFloat = true;
                Next();
            }
            else if (char.IsDigit(next))
            {
                Next();
            }
            else
            {
                break;
            }
        }

        var buffer = _content.Substring(start, _index - start);
        return isFloat
            ? new FloatLiteralToken(CreateSpan(lineStart, columnStart), buffer)
            : new IntLiteralToken(CreateSpan(lineStart, columnStart), buffer, 10);
    }

    /// <summary>
    /// Reads an integer literal with a two-character prefix (<c>0x</c> or <c>0b</c>).
    /// The token text keeps the prefix.
    /// </summary>
    /// <param name="start">Offset of the leading <c>0</c>.</param>
    /// <param name="lineStart">Line of the token's first character.</param>
    /// <param name="columnStart">Column of the token's first character.</param>
    /// <param name="radix">Base passed on to the <c>IntLiteralToken</c>.</param>
    /// <param name="isDigit">Decides whether a character is a digit of this literal.</param>
    /// <param name="noDigitsMessage">Error text used when the prefix is not followed by any digit.</param>
    private IntLiteralToken ParsePrefixedInt(int start, int lineStart, int columnStart, int radix, Func<char, bool> isDigit, string noDigitsMessage)
    {
        Next(2); // Skip the prefix

        var digitStart = _index;
        while (_index < _content.Length && isDigit(_content[_index]))
        {
            Next();
        }

        if (_index == digitStart)
        {
            throw new CompileException(Diagnostic
                .Error(noDigitsMessage)
                .At(_fileName, _line, _column)
                .Build());
        }

        return new IntLiteralToken(
            CreateSpan(lineStart, columnStart),
            _content.Substring(start, _index - start),
            radix);
    }

    /// <summary>
    /// Reads a double-quoted string literal on a single line. The token value is the raw text
    /// between the quotes; no escape sequences are interpreted here.
    /// </summary>
    /// <exception cref="CompileException">
    /// End of content or a newline is reached before the closing quote.
    /// </exception>
    private StringLiteralToken ParseString(int lineStart, int columnStart)
    {
        Next(); // Skip opening quote
        var start = _index;

        while (true)
        {
            if (_index >= _content.Length)
            {
                throw new CompileException(Diagnostic
                    .Error("Unclosed string literal")
                    .At(_fileName, _line, _column)
                    .Build());
            }

            var next = _content[_index];

            if (next == '\n')
            {
                throw new CompileException(Diagnostic
                    .Error("Unclosed string literal (newline found)")
                    .At(_fileName, _line, _column)
                    .Build());
            }

            if (next == '"')
            {
                var buffer = _content.Substring(start, _index - start);
                Next(); // Skip closing quote
                return new StringLiteralToken(CreateSpan(lineStart, columnStart), buffer);
            }

            Next();
        }
    }

    /// <summary>
    /// Tries to read a keyword or operator that is exactly <paramref name="length"/> characters
    /// long at the current position. The position only advances on success.
    /// </summary>
    /// <param name="length">Exact number of characters to match.</param>
    /// <param name="lineStart">Line of the token's first character.</param>
    /// <param name="columnStart">Column of the token's first character.</param>
    /// <param name="token">The matched token; only meaningful when the method returns <c>true</c>.</param>
    /// <returns><c>true</c> if a symbol was matched and consumed.</returns>
    private bool TryMatchSymbol(int length, int lineStart, int columnStart, out Token token)
    {
        token = null!;

        if (_index + length > _content.Length)
        {
            return false;
        }

        var span = _content.AsSpan(_index, length);

        var symbol = length switch
        {
            8 => span switch
            {
                "continue" => Symbol.Continue,
                _ => Symbol.None
            },
            6 => span switch
            {
                "return" => Symbol.Return,
                "struct" => Symbol.Struct,
                "extern" => Symbol.Extern,
                "module" => Symbol.Module,
                "export" => Symbol.Export,
                "import" => Symbol.Import,
                _ => Symbol.None
            },
            5 => span switch
            {
                "break" => Symbol.Break,
                "while" => Symbol.While,
                "defer" => Symbol.Defer,
                _ => Symbol.None
            },
            4 => span switch
            {
                "func" => Symbol.Func,
                "else" => Symbol.Else,
                "enum" => Symbol.Enum,
                _ => Symbol.None
            },
            3 => span switch
            {
                "for" => Symbol.For,
                "let" => Symbol.Let,
                _ => Symbol.None
            },
            2 => span switch
            {
                "if" => Symbol.If,
                "in" => Symbol.In,
                "==" => Symbol.Equal,
                "!=" => Symbol.NotEqual,
                "<=" => Symbol.LessThanOrEqual,
                ">=" => Symbol.GreaterThanOrEqual,
                "<<" => Symbol.LeftShift,
                ">>" => Symbol.RightShift,
                "&&" => Symbol.And,
                "||" => Symbol.Or,
                "::" => Symbol.DoubleColon,
                _ => Symbol.None
            },
            1 => span[0] switch
            {
                ':' => Symbol.Colon,
                '(' => Symbol.OpenParen,
                ')' => Symbol.CloseParen,
                '{' => Symbol.OpenBrace,
                '}' => Symbol.CloseBrace,
                '[' => Symbol.OpenBracket,
                ']' => Symbol.CloseBracket,
                ',' => Symbol.Comma,
                '.' => Symbol.Period,
                '=' => Symbol.Assign,
                '<' => Symbol.LessThan,
                '>' => Symbol.GreaterThan,
                '+' => Symbol.Plus,
                '-' => Symbol.Minus,
                '*' => Symbol.Star,
                '/' => Symbol.ForwardSlash,
                '!' => Symbol.Bang,
                '^' => Symbol.Caret,
                '&' => Symbol.Ampersand,
                ';' => Symbol.Semi,
                '%' => Symbol.Percent,
                '|' => Symbol.Pipe,
                '@' => Symbol.At,
                '?' => Symbol.QuestionMark,
                _ => Symbol.None
            },
            _ => Symbol.None
        };

        if (symbol == Symbol.None)
        {
            return false;
        }

        // A keyword must end at a word boundary; otherwise it is only the prefix of an
        // identifier (e.g. "iffy", "return_value") and must not be split off.
        if (char.IsLetter(span[0]))
        {
            var nextIdx = _index + length;
            if (nextIdx < _content.Length)
            {
                var nextChar = _content[nextIdx];
                if (char.IsLetterOrDigit(nextChar) || nextChar == '_')
                {
                    return false;
                }
            }
        }

        Next(length);
        token = new SymbolToken(CreateSpan(lineStart, columnStart), symbol);
        return true;
    }

    /// <summary>
    /// Reads an identifier: the longest run of letters, digits and underscores starting at
    /// the current position.
    /// </summary>
    private IdentifierToken ParseIdentifier(int lineStart, int columnStart)
    {
        var start = _index;

        while (_index < _content.Length)
        {
            var ch = _content[_index];
            if (!char.IsLetterOrDigit(ch) && ch != '_')
            {
                break;
            }

            Next();
        }

        return new IdentifierToken(
            CreateSpan(lineStart, columnStart),
            _content.Substring(start, _index - start));
    }

    /// <summary>
    /// Builds a span from the given start location to the current position.
    /// </summary>
    private SourceSpan CreateSpan(int lineStart, int columnStart)
    {
        return new SourceSpan(_fileName, new SourceLocation(lineStart, columnStart), new SourceLocation(_line, _column));
    }

    /// <summary>
    /// Skips one character, moving to the next line when that character is a newline.
    /// </summary>
    private void SkipCharacter()
    {
        if (_index < _content.Length && _content[_index] == '\n')
        {
            _line += 1;
            // Next() below bumps this to 1, the first column of the new line.
            _column = 0;
        }

        Next();
    }

    /// <summary>
    /// Advances the offset and the column by <paramref name="count"/>. Does no line
    /// accounting, so it must not be used to step over a newline.
    /// </summary>
    private void Next(int count = 1)
    {
        _index += count;
        _column += count;
    }
}