#include "comdel_lexer.h"
#include "token.h"
#include "tokens_type.h"

// NOTE(review): the names of the two standard-library includes were missing
// in the reviewed copy of this file; the list below is reconstructed from
// what the code uses (character classifiers, std::stringstream, std::string,
// std::move) - confirm against the original.
#include <cctype>
#include <sstream>
#include <string>
#include <utility>

/*************************************************************************
 *
 * Auxiliary functions - recognizing category of characters
 *
 *************************************************************************/

// The <cctype> classifiers have undefined behaviour when given a negative
// value other than EOF, which is what a plain (signed) char holds for any
// non-ASCII byte. Every call below therefore converts to unsigned char first.

// True if `ch` may begin an identifier: a letter or '_'.
inline bool identifierStart(char ch) {
    return isalpha(static_cast<unsigned char>(ch)) || ch == '_';
}

// True if `ch` may appear after the first character of an identifier:
// a letter, a digit or '_'.
inline bool identifierContinue(char ch) {
    return isalnum(static_cast<unsigned char>(ch)) || ch == '_';
}

// True if `ch` may begin a number: a decimal digit or a leading '-'.
inline bool numberStart(char ch) {
    return isdigit(static_cast<unsigned char>(ch)) || ch == '-';
}

// True for space, tab, line feed and carriage return.
inline bool isWhitespace(char ch) {
    return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
}

/*************************************************************************
 *
 * ComdelLexer class public methods
 *
 *************************************************************************/

// Takes ownership of `source`, registers it with `parseContext` through
// addFile() (the returned value is kept as fileId), and places the lexer on
// the first character of the input.
// `source[0]` is well defined for an empty string (it yields '\0').
// NOTE(review): position is built from (fileId, 1, 1, 0) - presumably
// line 1, column 1, offset 0; confirm against the Position constructor.
ComdelLexer::ComdelLexer(std::string fileName, std::string source, ParseContext *parseContext)
    : source(std::move(source)),
      parseContext(parseContext),
      // TODO Update this
      fileId(this->parseContext->addFile(fileName, this->source)),
      position(this->fileId, 1, 1, 0),
      ch(this->source[0]) {}

// Runs the lexer over the whole input.
// Each iteration remembers where the token starts (tokenBegin) and asks
// nextTokenType() to consume one token:
//   - on error the error is recorded and lexing continues with the input
//     that follows whatever nextTokenType() consumed;
//   - whitespace and comments are dropped;
//   - otherwise the token text is cut out of `source` and the token type is
//     passed through from_token() before the token is stored.
// An END_OF_FILE token is always appended last.
// Returns the collected tokens together with the collected errors.
LexerResult ComdelLexer::tokenize() {
    while (!eof()) {
        tokenBegin = position;
        auto tokenType = nextTokenType();

        if (!tokenType) {
            errors.push_back(tokenType.error());
            continue;
        }

        if (tokenType == TokenType::WHITESPACE || tokenType == TokenType::COMMENT) {
            continue;
        }

        // Token text is the slice [tokenBegin.offset, position.offset).
        std::string text = source.substr(tokenBegin.offset, position.offset - tokenBegin.offset);

        // NOTE(review): from_token() is defined elsewhere; it presumably
        // refines the type based on the text (e.g. reserved words) - confirm.
        tokenType = from_token(text, tokenType.value());
        tokens.emplace_back(*tokenType, Span(tokenBegin, position), text);
    }

    tokens.emplace_back(TokenType::END_OF_FILE, Span(position), "");

    return LexerResult{tokens, errors};
}

/*************************************************************************
 *
 * ComdelLexer class private methods
 *
 *************************************************************************/

// Numbers are sequences of digits and underscores
// with AT LEAST ONE digit.
unsigned ComdelLexer::takeNumberInRadix(Radix radix) { unsigned digitsTaken = 0; while (true) { if (digitIsValid(ch, radix)) { // skip and count real digits bump(); digitsTaken++; } else if (ch == '_') { // skip underscores bump(); } else { // some other character => end of number return digitsTaken; } } } bool ComdelLexer::digitIsValid(char ch, Radix radix) { // Check valid digits // digit is 0..1 if (ch == '1' || ch == '0') { // OK for all radixes return true; } //digit is 2..9 if (isdigit(ch)) { // OK for decimal and hex radixes return (radix != Radix::BIN_NUMBER); } // digit is a..f A..F else if (isxdigit(ch)) { // OK only for hex radix return (radix == Radix::HEX_NUMBER); } // NOT 0..9 a..f A..F return false; } ComdelLexer::Radix ComdelLexer::takeRadix() { if (ch == '0') { char nextChar = peek(); if (nextChar == 'x' || nextChar == 'X') { bump(2); return Radix::HEX_NUMBER; } else if (nextChar == 'b' || nextChar == 'B') { bump(2); return Radix::BIN_NUMBER; } } return Radix::DEC_NUMBER; } // color is sequence of hex-digits (preceded with # which is already consumed) unsigned ComdelLexer::takeHexColor() { unsigned digitsTaken = 0; while (true) { if (digitIsValid(ch, Radix::HEX_NUMBER)) { // skip and count hex digits bump(); digitsTaken++; } else { // some other character => end of number return digitsTaken; } } } PResult ComdelLexer::takeString() { Position lo = position; if (ch != '"') return PError({Span(lo), "expected string here"}); bump(); // skip starting " // Check escape-sequences \t \n \\ \" but leave them in string. // They will be replaced in the constructor of StringLiteral AST-node. 
while (ch != '"' && ch != '\n' && !eof()) { if (ch == '"') { // possible start of escape-sequence char nextCh = peek(); if (nextCh == '\\' || nextCh == '\"' || nextCh == 't' || nextCh == 'n') bump(); // legal escape-sequence: skip backslash else return PError({Span(lo, position), "illegal escape-sequence (allowed: \\n \\\" \\\\ \\t"}); } bump(); // skip normal char OR skip second char in escape-sequence } if (eof() || ch == '\n') { return PError({Span(lo, position), "unterminated string"}); } bump(); // skip closing " return TokenType::STRING; } PResult ComdelLexer::takeRawString() { Position lo = position; if (ch != '`') return PError({Span(lo), "expected string here"}); bump(); // skip starting ' // Ignore escape-sequences - take all characters until closing ' while (ch != '`' && !eof()) { bump(); // skip characters } if (eof() || ch == '\n') { return PError({Span(lo, position), "unterminated string"}); } bump(); // skip closing ' return TokenType::STRING; } void ComdelLexer::skipWhitespace() { while (!eof()) { if (isWhitespace(ch)) { bump(); } else { break; } } } void ComdelLexer::skipComment() { while (!eof() && ch != '\n') { bump(); } } // returns false if the comment is unterminated bool ComdelLexer::skipMultilineComment() { while (!eof()) { if (ch == '*') { bump(); if (ch == '/') { bump(); return true; } } bump(); } return false; } PResult ComdelLexer::nextTokenType() { if (isWhitespace(ch)) { skipWhitespace(); return TokenType::WHITESPACE; } else if (identifierStart(ch)) { bump(); while (identifierContinue(ch)) bump(); return TokenType::IDENTIFIER; } else if (numberStart(ch)) { if (ch == '-') { bump(); } unsigned takenDigits; Radix radix = takeRadix(); takenDigits = takeNumberInRadix(radix); if (takenDigits == 0) { return PError({Span(tokenBegin, position), "no digits found for number, or misspelled number"}); } if (isalnum(ch)) { return PError({Span(tokenBegin, position), "illegal digit or letter found at the end of number"}); } return TokenType::NUMBER; } 
else if (ch == '!') { bump(); return TokenType::NOT; } else if (ch == '<') { bump(); return TokenType::LT; } else if (ch == '>') { bump(); return TokenType::GT; } else if (ch == '#') { bump(); if (digitIsValid(ch, Radix::HEX_NUMBER)) { unsigned takenDigits = takeHexColor(); if (takenDigits != 6 && takenDigits != 8) { return PError({Span(tokenBegin, position), "hex-color must have 6 or 8 hex-digits"}); } if (isalnum(ch)) { return PError({Span(tokenBegin, position), "illegal letter found at the end of hex-color"}); } return TokenType::COLOR; } else { return PError({Span(tokenBegin, position), "unexpected #"}); } } else if (ch == '@') { bump(); while (identifierContinue(ch)) bump(); return TokenType::KEYWORD; } else if (ch == '"') { return takeString(); } else if (ch == '`') { return takeRawString(); } else if (ch == '(') { bump(); return TokenType::LPAREN; } else if (ch == ')') { bump(); return TokenType::RPAREN; } else if (ch == '[') { bump(); return TokenType::LBRACKET; } else if (ch == ']') { bump(); return TokenType::RBRACKET; } else if (ch == '{') { bump(); return TokenType::LBRACE; } else if (ch == '}') { bump(); return TokenType::RBRACE; } else if (ch == '/') { bump(); if (ch == '/') { bump(); skipComment(); return TokenType::COMMENT; } else if (ch == '*') { bump(); if (!skipMultilineComment()) { return PError({Span(tokenBegin, position), "unterminated multiline comment"}); } return TokenType::COMMENT; } return PError({Span(tokenBegin, position), "unexpected /"}); } else if (ch == '.') { bump(); return TokenType::DOT; } else if (ch == ':') { bump(); return TokenType::COLON; } else if (ch == ';') { bump(); return TokenType::SEMICOLON; } else if (ch == ',') { bump(); return TokenType::COMMA; } else if (ch == '=') { bump(); return TokenType::EQUALS; } else { std::stringstream message; message << "unexpected character `" << ch << "`"; bump(); return PError({Span(tokenBegin, position), message.str()}); } } // Move position to the new character in input file. 
// Fetch the new character in 'ch' void ComdelLexer::bump(unsigned count) { for (unsigned i = 0; i < count && !eof(); i++) { if (ch == '\n') { position.line += 1; position.col = 1; parseContext->getFile(fileId).addLineOffset(position.offset + 1); } else { position.col += 1; } position.offset += 1; ch = source[position.offset]; } } // Fetch and return next character without moving position. // Fetch does not cross line boundary. // Returns \n when next char does not exist (end of line or end of file) char ComdelLexer::peek() { if (position.offset + 1 == source.size()) // eof return '\n'; return source[position.offset + 1]; } bool ComdelLexer::eof() { return position.offset == source.size(); }