387 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			387 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| #include "comdel_lexer.h"
 | |
| #include "token.h"
 | |
| #include "tokens_type.h"
 | |
| 
 | |
| #include <fstream>
 | |
| #include <cctype>
 | |
| 
 | |
| 
 | |
| /*************************************************************************
 | |
|  *
 | |
|  * Auxiliary functions - recognizing category of characters
 | |
|  *
 | |
| *************************************************************************/
 | |
| 
 | |
// Returns true when `ch` may begin an identifier: an alphabetic character
// (as classified by std::isalpha in the current C locale) or an underscore.
inline bool identifierStart(char ch) {
    // <cctype> classifiers require a value representable as unsigned char;
    // passing a negative plain char (any byte >= 0x80 where char is signed)
    // is undefined behaviour, hence the cast.
    return std::isalpha(static_cast<unsigned char>(ch)) != 0 || ch == '_';
}
 | |
| 
 | |
// Returns true when `ch` may appear after the first character of an
// identifier: an alphanumeric character (per std::isalnum) or an underscore.
inline bool identifierContinue(char ch) {
    // Cast first: calling std::isalnum with a negative plain char is
    // undefined behaviour.
    return std::isalnum(static_cast<unsigned char>(ch)) != 0 || ch == '_';
}
 | |
| 
 | |
// Returns true when `ch` may begin a number literal: a decimal digit or a
// leading minus sign.
inline bool numberStart(char ch) {
    // Cast first: calling std::isdigit with a negative plain char is
    // undefined behaviour.
    return std::isdigit(static_cast<unsigned char>(ch)) != 0 || ch == '-';
}
 | |
| 
 | |
// Returns true for the four characters the lexer treats as whitespace:
// space, horizontal tab, line feed and carriage return.
inline bool isWhitespace(char ch) {
    switch (ch) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            return true;
        default:
            return false;
    }
}
 | |
| 
 | |
| 
 | |
| /*************************************************************************
 | |
|  *
 | |
|  * ComdelLexer class public methods
 | |
|  *
 | |
| *************************************************************************/
 | |
| 
 | |
| ComdelLexer::ComdelLexer(std::string fileName, std::string source,
 | |
|                          ParseContext *parseContext)
 | |
|         : source(std::move(source)),
 | |
|           parseContext(parseContext),
 | |
|         // TODO Update this
 | |
|           fileId(this->parseContext->addFile(fileName, this->source)),
 | |
|           position(this->fileId, 1, 1, 0),
 | |
|           ch(this->source[0]) {}
 | |
| 
 | |
| 
 | |
| LexerResult ComdelLexer::tokenize() {
 | |
|     while (!eof()) {
 | |
|         tokenBegin = position;
 | |
| 
 | |
|         auto tokenType = nextTokenType();
 | |
|         if (!tokenType) {
 | |
|             errors.push_back(tokenType.error());
 | |
|             continue;
 | |
|         }
 | |
|         if (tokenType == TokenType::WHITESPACE || tokenType == TokenType::COMMENT) {
 | |
|             continue;
 | |
|         }
 | |
| 
 | |
|         std::string text = source.substr(tokenBegin.offset,
 | |
|                                          position.offset - tokenBegin.offset);
 | |
| 
 | |
|         tokenType = from_token(text, tokenType.value());
 | |
| 
 | |
|         tokens.emplace_back(*tokenType, Span(tokenBegin, position), text);
 | |
|     }
 | |
| 
 | |
|     tokens.emplace_back(TokenType::END_OF_FILE, Span(position), "");
 | |
| 
 | |
|     return LexerResult{tokens, errors};
 | |
| }
 | |
| 
 | |
| 
 | |
| /*************************************************************************
 | |
|  *
 | |
|  * ComdelLexer class private methods
 | |
|  *
 | |
| *************************************************************************/
 | |
| 
 | |
| 
 | |
| // Numbers are sequences of digits and underscores
 | |
| // with AT LEAST ONE digit.
 | |
| 
 | |
| unsigned ComdelLexer::takeNumberInRadix(Radix radix) {
 | |
|     unsigned digitsTaken = 0;
 | |
| 
 | |
|     while (true) {
 | |
|         if (digitIsValid(ch, radix)) { // skip and count real digits
 | |
|             bump();
 | |
|             digitsTaken++;
 | |
|         } else if (ch == '_') { // skip underscores
 | |
|             bump();
 | |
|         } else { // some other character => end of number
 | |
|             return digitsTaken;
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| 
 | |
| bool ComdelLexer::digitIsValid(char ch, Radix radix) {
 | |
| 
 | |
|     // Check valid digits
 | |
|     // digit is 0..1
 | |
|     if (ch == '1' || ch == '0') {
 | |
|         // OK for all radixes
 | |
|         return true;
 | |
|     }
 | |
|     //digit is 2..9
 | |
|     if (isdigit(ch)) {
 | |
|         // OK for decimal and hex radixes
 | |
|         return (radix != Radix::BIN_NUMBER);
 | |
|     }
 | |
|         // digit is a..f A..F
 | |
|     else if (isxdigit(ch)) {
 | |
|         // OK only for hex radix
 | |
|         return (radix == Radix::HEX_NUMBER);
 | |
|     }
 | |
|     // NOT 0..9 a..f A..F
 | |
|     return false;
 | |
| }
 | |
| 
 | |
| 
 | |
| ComdelLexer::Radix ComdelLexer::takeRadix() {
 | |
|     if (ch == '0') {
 | |
|         char nextChar = peek();
 | |
|         if (nextChar == 'x' || nextChar == 'X') {
 | |
|             bump(2);
 | |
|             return Radix::HEX_NUMBER;
 | |
|         } else if (nextChar == 'b' || nextChar == 'B') {
 | |
|             bump(2);
 | |
|             return Radix::BIN_NUMBER;
 | |
|         }
 | |
|     }
 | |
|     return Radix::DEC_NUMBER;
 | |
| }
 | |
| 
 | |
| 
 | |
| // color is sequence of hex-digits (preceded with # which is already consumed)
 | |
| unsigned ComdelLexer::takeHexColor() {
 | |
|     unsigned digitsTaken = 0;
 | |
| 
 | |
|     while (true) {
 | |
|         if (digitIsValid(ch, Radix::HEX_NUMBER)) { // skip and count hex digits
 | |
|             bump();
 | |
|             digitsTaken++;
 | |
|         } else { // some other character => end of number
 | |
|             return digitsTaken;
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| 
 | |
| PResult<TokenType> ComdelLexer::takeString() {
 | |
|     Position lo = position;
 | |
| 
 | |
|     if (ch != '"')
 | |
|         return PError({Span(lo), "expected string here"});
 | |
|     bump(); // skip starting "
 | |
| 
 | |
|     // Check escape-sequences  \t  \n  \\  \"  but leave them in string.
 | |
|     // They will be replaced in the constructor of StringLiteral AST-node.
 | |
|     while (ch != '"' && ch != '\n' && !eof()) {
 | |
|         if (ch == '"') {
 | |
|             // possible start of escape-sequence
 | |
|             char nextCh = peek();
 | |
|             if (nextCh == '\\' || nextCh == '\"' || nextCh == 't' || nextCh == 'n')
 | |
|                 bump();   // legal escape-sequence: skip backslash
 | |
|             else
 | |
|                 return PError({Span(lo, position), "illegal escape-sequence (allowed:  \\n  \\\"  \\\\  \\t"});
 | |
|         }
 | |
|         bump(); // skip normal char OR skip second char in escape-sequence
 | |
|     }
 | |
| 
 | |
|     if (eof() || ch == '\n') {
 | |
|         return PError({Span(lo, position), "unterminated string"});
 | |
|     }
 | |
|     bump(); // skip closing "
 | |
|     return TokenType::STRING;
 | |
| }
 | |
| 
 | |
| 
 | |
| PResult<TokenType> ComdelLexer::takeRawString() {
 | |
|     Position lo = position;
 | |
| 
 | |
|     if (ch != '`')
 | |
|         return PError({Span(lo), "expected string here"});
 | |
|     bump(); // skip starting '
 | |
| 
 | |
|     // Ignore escape-sequences - take all characters until closing '
 | |
|     while (ch != '`' && !eof()) {
 | |
|         bump(); // skip characters
 | |
|     }
 | |
| 
 | |
|     if (eof() || ch == '\n') {
 | |
|         return PError({Span(lo, position), "unterminated string"});
 | |
|     }
 | |
|     bump(); // skip closing '
 | |
|     return TokenType::STRING;
 | |
| }
 | |
| 
 | |
| 
 | |
| void ComdelLexer::skipWhitespace() {
 | |
|     while (!eof()) {
 | |
|         if (isWhitespace(ch)) {
 | |
|             bump();
 | |
|         } else {
 | |
|             break;
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| 
 | |
| void ComdelLexer::skipComment() {
 | |
|     while (!eof() && ch != '\n') {
 | |
|         bump();
 | |
|     }
 | |
| }
 | |
| 
 | |
| 
 | |
| // returns false if the comment is unterminated
 | |
| bool ComdelLexer::skipMultilineComment() {
 | |
|     while (!eof()) {
 | |
|         if (ch == '*') {
 | |
|             bump();
 | |
|             if (ch == '/') {
 | |
|                 bump();
 | |
|                 return true;
 | |
|             }
 | |
|         }
 | |
|         bump();
 | |
|     }
 | |
|     return false;
 | |
| }
 | |
| 
 | |
| 
 | |
| PResult<TokenType> ComdelLexer::nextTokenType() {
 | |
|     if (isWhitespace(ch)) {
 | |
|         skipWhitespace();
 | |
|         return TokenType::WHITESPACE;
 | |
|     } else if (identifierStart(ch)) {
 | |
|         bump();
 | |
|         while (identifierContinue(ch))
 | |
|             bump();
 | |
|         return TokenType::IDENTIFIER;
 | |
|     } else if (numberStart(ch)) {
 | |
|         if (ch == '-') {
 | |
|             bump();
 | |
|         }
 | |
|         unsigned takenDigits;
 | |
|         Radix radix = takeRadix();
 | |
|         takenDigits = takeNumberInRadix(radix);
 | |
|         if (takenDigits == 0) {
 | |
|             return PError({Span(tokenBegin, position),
 | |
|                            "no digits found for number, or misspelled number"});
 | |
|         }
 | |
|         if (isalnum(ch)) {
 | |
|             return PError({Span(tokenBegin, position),
 | |
|                            "illegal digit or letter found at the end of number"});
 | |
|         }
 | |
|         return TokenType::NUMBER;
 | |
|     } else if (ch == '!') {
 | |
|         bump();
 | |
|         return TokenType::NOT;
 | |
|     } else if (ch == '<') {
 | |
|         bump();
 | |
|         return TokenType::LT;
 | |
|     } else if (ch == '>') {
 | |
|         bump();
 | |
|         return TokenType::GT;
 | |
|     } else if (ch == '#') {
 | |
|         bump();
 | |
| 
 | |
|         if (digitIsValid(ch, Radix::HEX_NUMBER)) {
 | |
|             unsigned takenDigits = takeHexColor();
 | |
|             if (takenDigits != 6 && takenDigits != 8) {
 | |
|                 return PError({Span(tokenBegin, position),
 | |
|                                "hex-color must have 6 or 8 hex-digits"});
 | |
|             }
 | |
|             if (isalnum(ch)) {
 | |
|                 return PError({Span(tokenBegin, position),
 | |
|                                "illegal letter found at the end of hex-color"});
 | |
|             }
 | |
|             return TokenType::COLOR;
 | |
|         } else {
 | |
|             return PError({Span(tokenBegin, position),
 | |
|                            "unexpected #"});
 | |
|         }
 | |
|     } else if (ch == '@') {
 | |
|         bump();
 | |
|         while (identifierContinue(ch))
 | |
|             bump();
 | |
|         return TokenType::KEYWORD;
 | |
|     } else if (ch == '"') {
 | |
|         return takeString();
 | |
|     } else if (ch == '`') {
 | |
|         return takeRawString();
 | |
|     } else if (ch == '(') {
 | |
|         bump();
 | |
|         return TokenType::LPAREN;
 | |
|     } else if (ch == ')') {
 | |
|         bump();
 | |
|         return TokenType::RPAREN;
 | |
|     } else if (ch == '[') {
 | |
|         bump();
 | |
|         return TokenType::LBRACKET;
 | |
|     } else if (ch == ']') {
 | |
|         bump();
 | |
|         return TokenType::RBRACKET;
 | |
|     } else if (ch == '{') {
 | |
|         bump();
 | |
|         return TokenType::LBRACE;
 | |
|     } else if (ch == '}') {
 | |
|         bump();
 | |
|         return TokenType::RBRACE;
 | |
|     } else if (ch == '/') {
 | |
|         bump();
 | |
|         if (ch == '/') {
 | |
|             bump();
 | |
|             skipComment();
 | |
|             return TokenType::COMMENT;
 | |
|         } else if (ch == '*') {
 | |
|             bump();
 | |
|             if (!skipMultilineComment()) {
 | |
|                 return PError({Span(tokenBegin, position),
 | |
|                                "unterminated multiline comment"});
 | |
|             }
 | |
|             return TokenType::COMMENT;
 | |
|         }
 | |
|         return PError({Span(tokenBegin, position),
 | |
|                        "unexpected /"});
 | |
|     } else if (ch == '.') {
 | |
|         bump();
 | |
|         return TokenType::DOT;
 | |
|     } else if (ch == ':') {
 | |
|         bump();
 | |
|         return TokenType::COLON;
 | |
|     } else if (ch == ';') {
 | |
|         bump();
 | |
|         return TokenType::SEMICOLON;
 | |
|     } else if (ch == ',') {
 | |
|         bump();
 | |
|         return TokenType::COMMA;
 | |
|     } else if (ch == '=') {
 | |
|         bump();
 | |
|         return TokenType::EQUALS;
 | |
|     } else {
 | |
|         std::stringstream message;
 | |
|         message << "unexpected character `" << ch << "`";
 | |
|         bump();
 | |
|         return PError({Span(tokenBegin, position), message.str()});
 | |
|     }
 | |
| }
 | |
| 
 | |
| 
 | |
| // Move position to the new character in input file.
 | |
| // Fetch the new character in 'ch'
 | |
| void ComdelLexer::bump(unsigned count) {
 | |
|     for (unsigned i = 0; i < count && !eof(); i++) {
 | |
|         if (ch == '\n') {
 | |
|             position.line += 1;
 | |
|             position.col = 1;
 | |
|             parseContext->getFile(fileId).addLineOffset(position.offset + 1);
 | |
|         } else {
 | |
|             position.col += 1;
 | |
|         }
 | |
|         position.offset += 1;
 | |
|         ch = source[position.offset];
 | |
|     }
 | |
| }
 | |
| 
 | |
| 
 | |
| // Fetch and return next character without moving position.
 | |
| // Fetch does not cross line boundary.
 | |
| // Returns \n when next char does not exist (end of line or end of file)
 | |
| char ComdelLexer::peek() {
 | |
|     if (position.offset + 1 == source.size()) // eof
 | |
|         return '\n';
 | |
| 
 | |
|     return source[position.offset + 1];
 | |
| }
 | |
| 
 | |
| 
 | |
| bool ComdelLexer::eof() {
 | |
|     return position.offset == source.size();
 | |
| }
 |