from cpl_query.extension.list import List
from cc_lang.model.language_definition import LanguageDefinition
from lexer.abc.lexer_abc import LexerABC
from lexer.model.token import Token
from lexer.model.token_types import UnresolvedTokenTypes, TokenTypes
from runtime.abc.runtime_service_abc import RuntimeServiceABC


class LexerService(LexerABC):

    def __init__(self, runtime: RuntimeServiceABC):
        self._runtime = runtime
        self._is_ml_comment = False

    def _add_tok(self, tokens: List[Token], value: str, input_token_type: UnresolvedTokenTypes) -> None:
        """
        Resolves the final token type of value and appends the created token to tokens

        :param tokens: token list the new token is appended to
        :param value: raw token text collected while scanning
        :param input_token_type: preliminary token type guessed by the tokenizer
        :return: None
        """
        token_type: TokenTypes = TokenTypes.Empty

        if value != '':
            if input_token_type == UnresolvedTokenTypes.Word:
                if value in LanguageDefinition.keywords:
                    token_type = TokenTypes.Keyword
                elif value in LanguageDefinition.datatypes:
                    token_type = TokenTypes.Type
                elif value in LanguageDefinition.bool_values:
                    token_type = TokenTypes.Bool
                elif value == UnresolvedTokenTypes.Empty:
                    token_type = TokenTypes.Empty
                else:
                    token_type = TokenTypes.Name
            elif input_token_type == UnresolvedTokenTypes.Number:
                token_type = TokenTypes.Number
            elif input_token_type == UnresolvedTokenTypes.String:
                token_type = TokenTypes.String
            elif input_token_type == UnresolvedTokenTypes.Expression_Character:
                token_type = TokenTypes.Expression_Character
            elif input_token_type == UnresolvedTokenTypes.Bool_Expression_Character:
                token_type = TokenTypes.Bool_Expression_Character
            elif input_token_type == UnresolvedTokenTypes.Format_Character:
                token_type = TokenTypes.Format_Character

            tokens.append(Token(token_type, value))

    def tokenize(self, line: str) -> List[Token]:
        tokens: List[Token] = List(Token)
        word = ''
        ol_comment = False
        is_string1 = False  # 'hello'
        is_string2 = False  # "hello"
        is_number = False
        is_expr_char = False

        for i in range(0, len(line)):
            c = line[i]
            # ignore comments and spaces
            if not ol_comment and not self._is_ml_comment:
                # comment filtering; the i > 0 guards prevent line[i - 1]
                # from wrapping around to the last character at i == 0
                if c == '#' and not is_string1 and not is_string2:
                    ol_comment = True
                elif i > 0 and line[i - 1] == '/' and c == '/':
                    ol_comment = True
                elif i > 0 and line[i - 1] == '/' and c == '*':
                    # following characters are skipped via the flag until '*/'
                    self._is_ml_comment = True
                # end of number
                elif not c.isdigit() and c != '.' and is_number:
                    self._add_tok(tokens, word, UnresolvedTokenTypes.Number)
                    # the terminating character may itself start a new token
                    local_tokens = self.tokenize(c)
                    for local_token in local_tokens:
                        tokens.append(local_token)
                    word = ''
                    is_number = False
                # end of expression char
                elif c not in LanguageDefinition.expr_chars and is_expr_char:
                    self._add_tok(tokens, word, UnresolvedTokenTypes.Expression_Character)
                    word = ''
                    is_expr_char = False
                # begin of is_string1
                elif c == '\'' and not is_string1:
                    is_string1 = True
                    word = ''
                # end of is_string1
                elif c == '\'' and is_string1:
                    is_string1 = False
                    self._add_tok(tokens, word, UnresolvedTokenTypes.String)
                    word = ''
                # begin of is_string2
                elif c == '\"' and not is_string2:
                    is_string2 = True
                    word = ''
                # end of is_string2
                elif c == '\"' and is_string2:
                    is_string2 = False
                    self._add_tok(tokens, word, UnresolvedTokenTypes.String)
                    word = ''
                # format char
                elif c in LanguageDefinition.format_chars:
                    self._add_tok(tokens, word, UnresolvedTokenTypes.Word)
                    self._add_tok(tokens, c, UnresolvedTokenTypes.Format_Character)
                    word = ''
                # begin of number
                elif c.isdigit() and not is_number and word == '':
                    word += c
                    is_number = True
                # continue number
                elif (c.isdigit() or c == '.') and is_number:
                    word += c
                # begin expression char
                elif c in LanguageDefinition.expr_chars and not is_expr_char:
                    word += c
                    is_expr_char = True
                # continue expression char
                elif c in LanguageDefinition.expr_chars and is_expr_char:
                    word += c
                # bool expression char
                elif c in LanguageDefinition.bool_expr_chars:
                    self._add_tok(tokens, word, UnresolvedTokenTypes.Word)
                    self._add_tok(tokens, c, UnresolvedTokenTypes.Bool_Expression_Character)
                    word = ''
                # end of word
                elif (c == ' ' and not is_string1 and not is_string2) or c == '\n':
                    self._add_tok(tokens, word, UnresolvedTokenTypes.Word)
                    word = ''
                else:
                    word += c

            if c == '\n' and ol_comment:
                ol_comment = False

            if i > 0 and line[i - 1] == '*' and c == '/':
                self._is_ml_comment = False

        return tokens
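

# Usage sketch, not part of the service itself: tokenizes one source line and
# prints the resulting tokens. Passing None for the runtime is an assumption
# that holds only because __init__ merely stores it; in real use, inject a
# concrete RuntimeServiceABC implementation. The sample input is hypothetical
# and which token types it yields depends on LanguageDefinition's keyword and
# character sets.
if __name__ == '__main__':
    lexer = LexerService(None)  # assumption: runtime is unused during tokenize
    for token in lexer.tokenize('var x = 42\n'):
        print(token)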