schema_editor/comdel/parser/comdel_lexer.cpp

387 lines
10 KiB
C++
Raw Permalink Normal View History

2022-06-09 18:24:27 +00:00
#include "comdel_lexer.h"
2022-03-29 19:31:45 +00:00
#include "token.h"
2022-05-27 06:18:17 +00:00
#include "tokens_type.h"
2022-03-29 19:31:45 +00:00
#include <fstream>
#include <cctype>
/*************************************************************************
*
* Auxiliary functions - recognizing category of characters
*
*************************************************************************/
// Returns true when `ch` may begin an identifier: a character that isalpha()
// classifies as alphabetic, or an underscore.
inline bool identifierStart(char ch) {
    // The <cctype> classifiers require a value representable as unsigned char
    // (or EOF). Plain char may be negative for non-ASCII bytes, so convert
    // before the call to avoid undefined behaviour.
    return isalpha(static_cast<unsigned char>(ch)) || ch == '_';
}
// Returns true when `ch` may appear after the first character of an
// identifier: a character that isalnum() classifies as alphanumeric, or an
// underscore.
inline bool identifierContinue(char ch) {
    // Convert to unsigned char first: passing a negative plain char to a
    // <cctype> classifier is undefined behaviour.
    return isalnum(static_cast<unsigned char>(ch)) || ch == '_';
}
// Returns true when `ch` may begin a number literal: a decimal digit or a
// leading minus sign.
inline bool numberStart(char ch) {
    // Convert to unsigned char first: passing a negative plain char to a
    // <cctype> classifier is undefined behaviour.
    return isdigit(static_cast<unsigned char>(ch)) || ch == '-';
}
// Returns true for the four characters the lexer treats as whitespace:
// space, horizontal tab, line feed and carriage return.
inline bool isWhitespace(char ch) {
    switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
/*************************************************************************
*
* ComdelLexer class public methods
*
*************************************************************************/
// Constructs a lexer over `source`.
//
// The source text is registered with `parseContext` through addFile() under
// `fileName`; the value addFile() returns is kept in `fileId` and used to
// build the initial position. `ch` starts as the first character of the
// source (the terminating '\0' when the source is empty).
//
// NOTE(review): several initializers read members set by earlier ones
// (this->parseContext, this->source, this->fileId). This is only correct if
// the members are declared in the class in this same order - confirm against
// the header before reordering anything here.
// NOTE(review): the Position arguments are presumably (file, line, column,
// offset) - confirm against the Position constructor.
ComdelLexer::ComdelLexer(std::string fileName,
                         std::string source,
                         ParseContext *parseContext)
    : source(std::move(source)),
      parseContext(parseContext),
      // TODO Update this
      fileId(this->parseContext->addFile(fileName, this->source)),
      position(this->fileId, 1, 1, 0),
      ch(this->source[0]) {}
2022-03-29 19:31:45 +00:00
// Scans the whole source and returns the collected tokens and errors.
//
// Each loop iteration recognizes one lexeme starting at the current position.
// Lexing errors are appended to `errors` and scanning continues with the
// following input; whitespace and comments are recognized but produce no
// token. An END_OF_FILE token is always appended last.
LexerResult ComdelLexer::tokenize() {
    while (!eof()) {
        tokenBegin = position;
        auto scanned = nextTokenType();

        if (!scanned) {
            // Record the problem and carry on so that several errors can be
            // reported in one run.
            errors.push_back(scanned.error());
            continue;
        }

        const bool producesNoToken = scanned == TokenType::WHITESPACE ||
                                     scanned == TokenType::COMMENT;
        if (producesNoToken) {
            continue;
        }

        // The lexeme is everything consumed since tokenBegin.
        const auto lexemeLength = position.offset - tokenBegin.offset;
        std::string text = source.substr(tokenBegin.offset, lexemeLength);

        // from_token() may replace the scanned type based on the lexeme text
        // (presumably keyword recognition - see its definition).
        scanned = from_token(text, scanned.value());
        tokens.emplace_back(*scanned, Span(tokenBegin, position), text);
    }

    tokens.emplace_back(TokenType::END_OF_FILE, Span(position), "");

    return LexerResult{tokens, errors};
}
/*************************************************************************
*
* ComdelLexer class private methods
*
*************************************************************************/
// Consumes the digits of a number literal in the given radix.
//
// A number is a run of digits valid for `radix`, optionally interleaved with
// underscores used as separators. Underscores are consumed but not counted.
// Returns how many real digits were consumed; the caller must reject a result
// of zero, since a number needs AT LEAST ONE digit.
unsigned ComdelLexer::takeNumberInRadix(Radix radix) {
    unsigned digitCount = 0;
    for (;;) {
        const bool isDigit = digitIsValid(ch, radix);
        if (!isDigit && ch != '_') {
            // Any other character ends the number.
            return digitCount;
        }
        if (isDigit) {
            ++digitCount;
        }
        bump();
    }
}
// Returns true when `ch` is a valid digit in the given radix:
//   0..1            - valid in every radix
//   2..9            - valid in decimal and hexadecimal
//   a..f and A..F   - valid in hexadecimal only
// Every other character is rejected.
bool ComdelLexer::digitIsValid(char ch, Radix radix) {
    if (ch == '0' || ch == '1') {
        return true;
    }

    // The <cctype> classifiers require a value representable as unsigned char
    // (or EOF); a negative plain char would be undefined behaviour.
    const auto uch = static_cast<unsigned char>(ch);

    if (isdigit(uch)) {
        // Remaining decimal digits 2..9.
        return radix != Radix::BIN_NUMBER;
    }
    if (isxdigit(uch)) {
        // Decimal digits were handled above, so this is a..f / A..F.
        return radix == Radix::HEX_NUMBER;
    }
    return false;
}
2022-05-27 06:18:17 +00:00
// Detects and consumes an optional radix prefix at the current position.
//
// "0x"/"0X" selects hexadecimal and "0b"/"0B" selects binary; in both cases
// the two prefix characters are consumed. Anything else is treated as a
// decimal number and nothing is consumed.
ComdelLexer::Radix ComdelLexer::takeRadix() {
    if (ch != '0') {
        return Radix::DEC_NUMBER;
    }

    switch (peek()) {
    case 'x':
    case 'X':
        bump(2);
        return Radix::HEX_NUMBER;
    case 'b':
    case 'B':
        bump(2);
        return Radix::BIN_NUMBER;
    default:
        return Radix::DEC_NUMBER;
    }
}
2022-04-08 18:17:58 +00:00
// color is sequence of hex-digits (preceded with # which is already consumed)
2022-03-29 19:31:45 +00:00
unsigned ComdelLexer::takeHexColor() {
unsigned digitsTaken = 0;
2022-05-27 06:18:17 +00:00
while (true) {
2022-03-29 19:31:45 +00:00
if (digitIsValid(ch, Radix::HEX_NUMBER)) { // skip and count hex digits
bump();
digitsTaken++;
} else { // some other character => end of number
return digitsTaken;
}
}
}
// Consumes a double-quoted string literal starting at the current position.
//
// Returns TokenType::STRING on success, or an error when
//   - the current character is not an opening '"',
//   - a backslash is followed by anything other than n, t, '"' or '\\',
//   - the line or the input ends before the closing '"'.
//
// Escape sequences are only validated here and are left untouched in the
// source text; per the original note they are replaced later, in the
// constructor of the StringLiteral AST node.
PResult<TokenType> ComdelLexer::takeString() {
    Position lo = position;

    if (ch != '"')
        return PError({Span(lo), "expected string here"});

    bump(); // skip opening "

    while (ch != '"' && ch != '\n' && !eof()) {
        // A backslash starts an escape sequence. (The previous code tested
        // for '"' here, which can never be true inside this loop, so escapes
        // were never recognized and \" terminated the string early.)
        if (ch == '\\') {
            char nextCh = peek();
            if (nextCh == '\\' || nextCh == '"' || nextCh == 't' || nextCh == 'n')
                bump(); // legal escape sequence: skip the backslash
            else
                return PError({Span(lo, position), "illegal escape-sequence (allowed: \\n \\\" \\\\ \\t)"});
        }
        bump(); // skip a normal char OR the second char of an escape sequence
    }

    if (eof() || ch == '\n') {
        return PError({Span(lo, position), "unterminated string"});
    }

    bump(); // skip closing "

    return TokenType::STRING;
}
2022-05-27 06:18:17 +00:00
// Consumes a raw string literal delimited by backticks (`...`).
//
// No escape sequences are interpreted: every character up to the closing
// backtick belongs to the string, including line breaks. Returns
// TokenType::STRING on success, or an error when the current character is not
// an opening backtick or the input ends before the closing one.
PResult<TokenType> ComdelLexer::takeRawString() {
    Position lo = position;

    if (ch != '`')
        return PError({Span(lo), "expected string here"});

    bump(); // skip opening `

    while (ch != '`' && !eof()) {
        bump(); // take every character verbatim
    }

    // The loop only stops on a backtick or at end of input, so end of input
    // is the only failure left to check (the former `ch == '\n'` test could
    // never be true here).
    if (eof()) {
        return PError({Span(lo, position), "unterminated string"});
    }

    bump(); // skip closing `

    return TokenType::STRING;
}
// Advances past a run of whitespace characters, stopping at the first
// non-whitespace character or at end of input.
void ComdelLexer::skipWhitespace() {
    while (!eof() && isWhitespace(ch)) {
        bump();
    }
}
// Advances to the end of a single-line comment. The terminating '\n' itself
// is not consumed; scanning also stops at end of input.
void ComdelLexer::skipComment() {
    for (;;) {
        if (eof() || ch == '\n') {
            return;
        }
        bump();
    }
}
// returns false if the comment is unterminated
bool ComdelLexer::skipMultilineComment() {
while (!eof()) {
if (ch == '*') {
bump();
if (ch == '/') {
bump();
return true;
}
}
bump();
}
return false;
}
// Recognizes one lexeme starting at the current character and consumes it.
//
// Returns the type of the recognized lexeme (including WHITESPACE and COMMENT,
// which the caller discards) or an error describing why the input at this
// position is not a valid lexeme. Error spans start at `tokenBegin`, which
// the caller sets before invoking this function. Every error path is reached
// only after at least one character has been consumed, so the caller's loop
// always makes progress.
PResult<TokenType> ComdelLexer::nextTokenType() {
    if (isWhitespace(ch)) {
        skipWhitespace();
        return TokenType::WHITESPACE;
    }

    if (identifierStart(ch)) {
        bump();
        while (identifierContinue(ch))
            bump();
        return TokenType::IDENTIFIER;
    }

    if (numberStart(ch)) {
        if (ch == '-') {
            bump(); // optional sign
        }
        const Radix radix = takeRadix();
        const unsigned takenDigits = takeNumberInRadix(radix);
        if (takenDigits == 0) {
            return PError({Span(tokenBegin, position),
                           "no digits found for number, or misspelled number"});
        }
        // A number must not run straight into a letter or an out-of-radix
        // digit. Cast first: <cctype> classifiers are undefined for negative
        // plain char values.
        if (isalnum(static_cast<unsigned char>(ch))) {
            return PError({Span(tokenBegin, position),
                           "illegal digit or letter found at the end of number"});
        }
        return TokenType::NUMBER;
    }

    switch (ch) {
    case '!':
        bump();
        return TokenType::NOT;
    case '<':
        bump();
        return TokenType::LT;
    case '>':
        bump();
        return TokenType::GT;

    case '#': {
        // Hex color: '#' followed by exactly 6 or 8 hex digits.
        bump();
        if (!digitIsValid(ch, Radix::HEX_NUMBER)) {
            return PError({Span(tokenBegin, position),
                           "unexpected #"});
        }
        const unsigned takenDigits = takeHexColor();
        if (takenDigits != 6 && takenDigits != 8) {
            return PError({Span(tokenBegin, position),
                           "hex-color must have 6 or 8 hex-digits"});
        }
        if (isalnum(static_cast<unsigned char>(ch))) {
            return PError({Span(tokenBegin, position),
                           "illegal letter found at the end of hex-color"});
        }
        return TokenType::COLOR;
    }

    case '@':
        // '@' followed by zero or more identifier characters.
        bump();
        while (identifierContinue(ch))
            bump();
        return TokenType::KEYWORD;

    case '"':
        return takeString();
    case '`':
        return takeRawString();

    case '(':
        bump();
        return TokenType::LPAREN;
    case ')':
        bump();
        return TokenType::RPAREN;
    case '[':
        bump();
        return TokenType::LBRACKET;
    case ']':
        bump();
        return TokenType::RBRACKET;
    case '{':
        bump();
        return TokenType::LBRACE;
    case '}':
        bump();
        return TokenType::RBRACE;

    case '/':
        // Only valid as the start of a "//" or "/* */" comment.
        bump();
        if (ch == '/') {
            bump();
            skipComment();
            return TokenType::COMMENT;
        }
        if (ch == '*') {
            bump();
            if (!skipMultilineComment()) {
                return PError({Span(tokenBegin, position),
                               "unterminated multiline comment"});
            }
            return TokenType::COMMENT;
        }
        return PError({Span(tokenBegin, position),
                       "unexpected /"});

    case '.':
        bump();
        return TokenType::DOT;
    case ':':
        bump();
        return TokenType::COLON;
    case ';':
        bump();
        return TokenType::SEMICOLON;
    case ',':
        bump();
        return TokenType::COMMA;
    case '=':
        bump();
        return TokenType::EQUALS;

    default: {
        // Build the message before consuming the offending character.
        // Plain string concatenation avoids depending on <sstream>, which
        // this file does not include directly.
        std::string message = "unexpected character `";
        message += ch;
        message += '`';
        bump();
        return PError({Span(tokenBegin, position), message});
    }
    }
}
// Move position to the new character in input file.
// Fetch the new character in 'ch'
void ComdelLexer::bump(unsigned count) {
2022-05-27 06:18:17 +00:00
for (unsigned i = 0; i < count && !eof(); i++) {
2022-03-29 19:31:45 +00:00
if (ch == '\n') {
position.line += 1;
position.col = 1;
2022-05-27 06:18:17 +00:00
parseContext->getFile(fileId).addLineOffset(position.offset + 1);
2022-03-29 19:31:45 +00:00
} else {
position.col += 1;
}
position.offset += 1;
ch = source[position.offset];
}
}
// Fetch and return next character without moving position.
// Fetch does not cross line boundary.
2022-04-08 18:17:58 +00:00
// Returns \n when next char does not exist (end of line or end of file)
2022-03-29 19:31:45 +00:00
char ComdelLexer::peek() {
2022-05-27 06:18:17 +00:00
if (position.offset + 1 == source.size()) // eof
2022-03-29 19:31:45 +00:00
return '\n';
2022-05-27 06:18:17 +00:00
return source[position.offset + 1];
2022-03-29 19:31:45 +00:00
}
// Returns true once the current offset has reached the end of the source,
// i.e. every character has been consumed.
bool ComdelLexer::eof() {
    const auto sourceLength = source.size();
    return position.offset == sourceLength;
}