444 lines
10 KiB
C++
444 lines
10 KiB
C++
#include "comdellexer.h"
|
|
#include "token.h"
|
|
#include "tokenstype.h"
|
|
|
|
#include <fstream>
|
|
#include <cctype>
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* Auxiliary functions - recognizing category of characters
|
|
*
|
|
*************************************************************************/
|
|
|
|
// Returns true when `ch` may begin an identifier: an alphabetic character
// (as classified by std::isalpha) or an underscore.
inline bool identifierStart(char ch) {
    // The <cctype> classifiers are undefined for negative arguments other
    // than EOF, so the char is widened through unsigned char first.
    return ch == '_' || std::isalpha(static_cast<unsigned char>(ch)) != 0;
}
|
|
|
|
// Returns true when `ch` may appear after the first character of an
// identifier: a letter or digit (as classified by std::isalnum) or an
// underscore.
inline bool identifierContinue(char ch) {
    // The <cctype> classifiers are undefined for negative arguments other
    // than EOF, so the char is widened through unsigned char first.
    return ch == '_' || std::isalnum(static_cast<unsigned char>(ch)) != 0;
}
|
|
|
|
// Returns true when `ch` is a decimal digit, i.e. may begin a number.
inline bool numberStart(char ch) {
    // The <cctype> classifiers are undefined for negative arguments other
    // than EOF, so the char is widened through unsigned char first.
    return std::isdigit(static_cast<unsigned char>(ch)) != 0;
}
|
|
|
|
// Returns true for the four characters the lexer treats as whitespace:
// space, horizontal tab, line feed and carriage return.
inline bool isWhitespace(char ch) {
    switch (ch) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            return true;
        default:
            return false;
    }
}
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* ComdelLexer class public methods
|
|
*
|
|
*************************************************************************/
|
|
|
|
// Builds a lexer over `source`.
//
// The source text is registered with `parseContext` under `fileName` and the
// value returned by addFile() is kept as this lexer's fileId. `ch` is primed
// with the first character of the source.
//
// NOTE(review): several initializers read other members through `this->`,
// so they rely on the member declaration order in the class (not visible
// here) matching the order below - confirm before reordering members.
ComdelLexer::ComdelLexer(std::string fileName, std::string source,
                         ParseContext* parseContext)
    : source(std::move(source)),
      parseContext(parseContext),
      // TODO: replace the path_absolute call with the Qt equivalent
      fileId(this->parseContext->addFile(fileName, this->source)),
      // presumably (file, line 1, column 1, offset 0) - confirm against Position
      position(this->fileId, 1, 1, 0),
      ch(this->source[0])
{
}
|
|
|
|
|
|
// Lexes the source from the current position to the end.
//
// Every successfully recognised token except whitespace and comments is
// appended to `tokens` together with its span and source text; every lexing
// error is appended to `errors` and scanning simply resumes at the current
// position. A final END_OF_FILE token with empty text is always appended.
// Returns the accumulated tokens and errors.
LexerResult ComdelLexer::tokenize() {
    while (!eof()) {
        tokenBegin = position;

        auto scanned = nextTokenType();
        if (!scanned) {
            errors.push_back(scanned.error());
            continue;
        }

        // Whitespace and comments are consumed but never reach the token list.
        const bool isTrivia = scanned == TokenType::WHITESPACE
                           || scanned == TokenType::COMMENT;
        if (!isTrivia) {
            std::string text = source.substr(tokenBegin.offset,
                                             position.offset - tokenBegin.offset);
            tokens.push_back(Token(*scanned, Span(tokenBegin, position), text));
        }
    }

    tokens.push_back(Token(TokenType::END_OF_FILE, Span(position), ""));

    return LexerResult { tokens, errors };
}
|
|
|
|
|
|
/*************************************************************************
|
|
*
|
|
* ComdelLexer class private methods
|
|
*
|
|
*************************************************************************/
|
|
|
|
|
|
// Numbers are sequences of digits and underscores
|
|
// with AT LEAST ONE digit.
|
|
|
|
unsigned ComdelLexer::takeNumberInRadix(Radix radix) {
|
|
unsigned digitsTaken = 0;
|
|
|
|
while(true) {
|
|
if (digitIsValid(ch, radix)) { // skip and count real digits
|
|
bump();
|
|
digitsTaken++;
|
|
} else if(ch == '_') { // skip underscores
|
|
bump();
|
|
} else { // some other character => end of number
|
|
return digitsTaken;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// Tells whether `ch` is a digit that may appear in a number of the given
// radix:
//   '0' and '1'          - valid in every radix,
//   '2'..'9'             - valid unless the radix is binary,
//   'a'..'f', 'A'..'F'   - valid only for hexadecimal,
//   anything else        - never valid.
bool ComdelLexer::digitIsValid(char ch, Radix radix) {
    if (ch == '0' || ch == '1') {
        return true;
    }

    // The <cctype> classifiers are undefined for negative arguments other
    // than EOF, so the char is widened through unsigned char first.
    const auto uch = static_cast<unsigned char>(ch);

    if (std::isdigit(uch)) {
        // remaining decimal digits 2..9
        return radix != Radix::BIN_NUMBER;
    }
    if (std::isxdigit(uch)) {
        // decimal digits were handled above, so this is a..f / A..F
        return radix == Radix::HEX_NUMBER;
    }
    return false;
}
|
|
|
|
|
|
// Detects an optional radix prefix at the current position.
//
// "0x"/"0X" selects hexadecimal and "0b"/"0B" selects binary; in both cases
// the two prefix characters are consumed. Anything else is decimal and
// nothing is consumed.
ComdelLexer::Radix ComdelLexer::takeRadix() {
    if (ch != '0') {
        return Radix::DEC_NUMBER;
    }

    switch (peek()) {
        case 'x':
        case 'X':
            bump(2);
            return Radix::HEX_NUMBER;
        case 'b':
        case 'B':
            bump(2);
            return Radix::BIN_NUMBER;
        default:
            return Radix::DEC_NUMBER;
    }
}
|
|
|
|
|
|
// color is sequence of hex-digits (preceeded with # which is already consumed)
|
|
unsigned ComdelLexer::takeHexColor() {
|
|
unsigned digitsTaken = 0;
|
|
|
|
while(true) {
|
|
if (digitIsValid(ch, Radix::HEX_NUMBER)) { // skip and count hex digits
|
|
bump();
|
|
digitsTaken++;
|
|
} else { // some other character => end of number
|
|
return digitsTaken;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// Lexes a double-quoted string literal starting at the current position.
//
// The literal must be closed by '"' on the same line. The escape sequences
// \t \n \\ \" are validated here but left untouched in the source text
// (per the original note, they are replaced later, in the constructor of
// the StringLiteral AST node).
//
// Returns TokenType::STRING on success, or an error when the current
// character is not '"', when an unknown escape sequence is met, or when the
// line or the file ends before the closing quote.
PResult<TokenType> ComdelLexer::takeString() {
    Position lo = position;

    if (ch != '"')
        return PError({Span(lo), "expected string here"});
    bump(); // skip starting "

    while (ch != '"' && ch != '\n' && !eof()) {
        // A backslash starts an escape sequence. (The previous code tested
        // for '"' here, which can never hold inside this loop, so an escaped
        // quote terminated the string prematurely.)
        if (ch == '\\') {
            char nextCh = peek();
            if (nextCh == '\\' || nextCh == '\"' || nextCh == 't' || nextCh == 'n')
                bump(); // legal escape-sequence: skip backslash
            else
                return PError({Span(lo, position), "illegal escape-sequence (allowed: \\n \\\" \\\\ \\t"});
        }
        bump(); // skip normal char OR skip second char in escape-sequence
    }

    if (eof() || ch == '\n') {
        return PError({Span(lo, position), "unterminated string"});
    }
    bump(); // skip closing "
    return TokenType::STRING;
}
|
|
|
|
|
|
// Lexes a raw string literal delimited by backticks.
//
// No escape sequences are recognised: every character up to the next
// backtick is taken verbatim, including line breaks.
//
// Returns TokenType::STRING on success, or an error when the current
// character is not a backtick or the closing backtick is missing.
PResult<TokenType> ComdelLexer::takeRawString()
{
    Position start = position;

    if (ch != '`') {
        return PError({Span(start), "expected string here"});
    }
    bump(); // opening backtick

    // swallow everything up to (not including) the closing backtick
    while (!eof() && ch != '`') {
        bump();
    }

    const bool closed = !(eof() || ch == '\n');
    if (!closed) {
        return PError({Span(start, position), "unterminated string"});
    }

    bump(); // closing backtick
    return TokenType::STRING;
}
|
|
|
|
|
|
void ComdelLexer::skipWhitespace() {
|
|
while (!eof()) {
|
|
if (isWhitespace(ch)) {
|
|
bump();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void ComdelLexer::skipComment() {
|
|
while (!eof() && ch != '\n') {
|
|
bump();
|
|
}
|
|
}
|
|
|
|
|
|
// returns false if the comment is unterminated
|
|
bool ComdelLexer::skipMultilineComment() {
|
|
while (!eof()) {
|
|
if (ch == '*') {
|
|
bump();
|
|
if (ch == '/') {
|
|
bump();
|
|
return true;
|
|
}
|
|
}
|
|
bump();
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
// Recognises and consumes one token starting at the current character and
// returns its type, or an error describing why no token could be formed.
//
// Recognised, in order of checking:
//   - runs of whitespace                      -> WHITESPACE
//   - identifiers                             -> IDENTIFIER
//   - numbers (decimal, 0x.. hex, 0b.. bin)   -> NUMBER
//   - '#' followed by 6 or 8 hex digits       -> COLOR
//   - '@' followed by identifier characters   -> KEYWORD
//   - "..." and `...` strings                 -> STRING
//   - "//" and "/* */" comments               -> COMMENT
//   - single-character punctuation
// Any other character is consumed and reported as unexpected.
//
// Error spans run from `tokenBegin` (set by the caller) to the current
// position.
PResult<TokenType> ComdelLexer::nextTokenType() {
    if (isWhitespace(ch)) {
        skipWhitespace();
        return TokenType::WHITESPACE;
    }

    if (identifierStart(ch)) {
        bump();
        while (identifierContinue(ch))
            bump();
        return TokenType::IDENTIFIER;
    }

    if (numberStart(ch)) {
        const Radix radix = takeRadix();
        const unsigned takenDigits = takeNumberInRadix(radix);
        if (takenDigits == 0) {
            return PError({Span(tokenBegin, position),
                           "no digits found for number, or misspelled number"});
        }
        // A letter or digit glued to the number (e.g. a digit that is not
        // valid in this radix) is an error. The cast avoids undefined
        // behaviour of <cctype> classifiers on negative char values.
        if (std::isalnum(static_cast<unsigned char>(ch))) {
            return PError({Span(tokenBegin, position),
                           "illegal digit or letter found at the end of number"});
        }
        return TokenType::NUMBER;
    }

    switch (ch) {
        case '!': bump(); return TokenType::NOT;
        case '<': bump(); return TokenType::LT;
        case '>': bump(); return TokenType::GT;
        case '(': bump(); return TokenType::LPAREN;
        case ')': bump(); return TokenType::RPAREN;
        case '[': bump(); return TokenType::LBRACKET;
        case ']': bump(); return TokenType::RBRACKET;
        case '{': bump(); return TokenType::LBRACE;
        case '}': bump(); return TokenType::RBRACE;
        case '.': bump(); return TokenType::DOT;
        case ':': bump(); return TokenType::COLON;
        case ';': bump(); return TokenType::SEMICOLON;
        case ',': bump(); return TokenType::COMMA;
        case '=': bump(); return TokenType::EQUALS;

        case '#': {
            bump();
            if (!digitIsValid(ch, Radix::HEX_NUMBER)) {
                return PError({Span(tokenBegin, position),
                               "unexpected #"});
            }
            const unsigned takenDigits = takeHexColor();
            if (takenDigits != 6 && takenDigits != 8) {
                return PError({Span(tokenBegin, position),
                               "hex-color must have 6 or 8 hex-digits"});
            }
            if (std::isalnum(static_cast<unsigned char>(ch))) {
                return PError({Span(tokenBegin, position),
                               "illegal letter found at the end of hex-color"});
            }
            return TokenType::COLOR;
        }

        case '@':
            bump();
            while (identifierContinue(ch))
                bump();
            return TokenType::KEYWORD;

        case '"':
            return takeString();

        case '`':
            return takeRawString();

        case '/':
            bump();
            if (ch == '/') {
                bump();
                skipComment();
                return TokenType::COMMENT;
            }
            if (ch == '*') {
                bump();
                if (!skipMultilineComment()) {
                    return PError({Span(tokenBegin, position),
                                   "unterminated multiline comment"});
                }
                return TokenType::COMMENT;
            }
            // a lone '/' is not a token in this language
            return PError({Span(tokenBegin, position),
                           "unexpected /"});

        default:
            break;
    }

    // Build the message from the offending character before consuming it,
    // then consume it so the caller always makes progress.
    std::stringstream message;
    message << "unexpected character `" << ch << "`";
    bump();
    return PError({Span(tokenBegin, position), message.str()});
}
|
|
|
|
|
|
// Move position to the new character in input file.
|
|
// Fetch the new character in 'ch'
|
|
void ComdelLexer::bump(unsigned count) {
|
|
for (unsigned i=0; i < count && !eof(); i++) {
|
|
if (ch == '\n') {
|
|
position.line += 1;
|
|
position.col = 1;
|
|
parseContext->getFile(fileId).addLineOffset(position.offset+1);
|
|
} else {
|
|
position.col += 1;
|
|
}
|
|
position.offset += 1;
|
|
ch = source[position.offset];
|
|
}
|
|
}
|
|
|
|
|
|
// Fetch and return next character without moving position.
|
|
// Fetch does not cross line boundary.
|
|
// Returns \n when next char does not exists (end of line or end of file)
|
|
char ComdelLexer::peek() {
|
|
if(position.offset+1 == source.size()) // eof
|
|
return '\n';
|
|
|
|
return source[position.offset+1];
|
|
}
|
|
|
|
|
|
bool ComdelLexer::eof() {
|
|
return position.offset == source.size();
|
|
}
|