Added lexer

2021-08-14 16:41:25 +02:00
parent 4a3f9b4b9d
commit c909ce31d8
18 changed files with 438 additions and 210 deletions
--- a/src/lexer/service/lexer_service.py
+++ b/src/lexer/service/lexer_service.py
@@ -1,7 +1,166 @@
+from cpl_query.extension.list import List
+
+from cc_lang.model.language_definition import LanguageDefinition
 from lexer.abc.lexer_abc import LexerABC
+from lexer.model.token import Token
+from lexer.model.token_types import UnresolvedTokenTypes, TokenTypes
+from runtime.abc.runtime_service_abc import RuntimeServiceABC


 class LexerService(LexerABC):

-    def __init__(self):
-        pass
+    def __init__(self, runtime: RuntimeServiceABC) -> None:
+        """
+        Stores the runtime service and initialises the lexer state
+        :param runtime: runtime service kept on the instance as _runtime
+        """
+        # True while tokenize() is inside a /* ... */ comment; kept on the
+        # instance because tokenize() reads and updates it on every call
+        self._is_ml_comment = False
+        self._runtime = runtime
+
+    def _add_tok(self, tokens: List[Token], value: str, input_token_type: UnresolvedTokenTypes) -> None:
+        """
+        Resolves the final token type of value and appends the new token to tokens
+        :param tokens: token list that receives the new token (modified in place)
+        :param value: text of the token; an empty string is ignored
+        :param input_token_type: preliminary type determined by the tokenizer
+        :return:
+        """
+        # nothing collected, nothing to add
+        if value == '':
+            return
+
+        resolved: TokenTypes = TokenTypes.Empty
+
+        if input_token_type == UnresolvedTokenTypes.Word:
+            # words are classified with the lookup tables of the language definition
+            if value in LanguageDefinition.keywords:
+                resolved = TokenTypes.Keyword
+            elif value in LanguageDefinition.datatypes:
+                resolved = TokenTypes.Type
+            elif value in LanguageDefinition.bool_values:
+                resolved = TokenTypes.Bool
+            # NOTE(review): this compares the token text, not input_token_type,
+            # with UnresolvedTokenTypes.Empty - confirm the intent
+            elif value == UnresolvedTokenTypes.Empty:
+                resolved = TokenTypes.Empty
+            else:
+                resolved = TokenTypes.Name
+        else:
+            # remaining unresolved types map one-to-one onto a token type;
+            # anything not listed keeps TokenTypes.Empty
+            one_to_one = (
+                (UnresolvedTokenTypes.Number, TokenTypes.Number),
+                (UnresolvedTokenTypes.String, TokenTypes.String),
+                (UnresolvedTokenTypes.Expression_Character, TokenTypes.Expression_Character),
+                (UnresolvedTokenTypes.Bool_Expression_Character, TokenTypes.Bool_Expression_Character),
+                (UnresolvedTokenTypes.Format_Character, TokenTypes.Format_Character),
+            )
+            for unresolved, target in one_to_one:
+                if input_token_type == unresolved:
+                    resolved = target
+                    break
+
+        tokens.append(Token(resolved, value))
+
+    def tokenize(self, line: str) -> List[Token]:
+        """
+        Splits one line of source text into tokens
+
+        The multi-line comment state lives in self._is_ml_comment, so a /* that
+        is opened in one call keeps later calls in comment mode until */ is read.
+        Text collected after the last separator is not emitted, e.g. when line
+        does not end with a newline.
+        :param line: text to tokenize
+        :return: tokens found in line, in order of appearance
+        """
+        tokens: List[Token] = List(Token)
+        word = ''
+        ol_comment = False
+        is_string1 = False  # 'hello'
+        is_string2 = False  # "hello"
+        is_number = False
+        is_expr_char = False
+        # previous character of this line; '' for the first character so the
+        # two-character checks (//, /*, */) cannot wrap around to the line end
+        prev = ''
+
+        for c in line:
+            # ignore comments and spaces
+            if not ol_comment and not self._is_ml_comment:
+                # comment filtering
+                if c == '#' and not is_string1 and not is_string2:
+                    ol_comment = True
+
+                elif prev == '/' and c == '/':
+                    ol_comment = True
+
+                elif prev == '/' and c == '*':
+                    # the rest of the comment is skipped by the guard above
+                    self._is_ml_comment = True
+
+                # end of number
+                elif not c.isdigit() and c != '.' and is_number:
+                    self._add_tok(tokens, word, UnresolvedTokenTypes.Number)
+                    # the terminating character is tokenized on its own;
+                    # NOTE(review): a character that only starts a word or an
+                    # expression yields no token here, because tokenize() does
+                    # not emit a word that is still pending at the end of its input
+                    local_tokens = self.tokenize(c)
+                    for local_token in local_tokens:
+                        tokens.append(local_token)
+
+                    word = ''
+                    is_number = False
+
+                # end of expression char
+                elif c not in LanguageDefinition.expr_chars and is_expr_char:
+                    self._add_tok(tokens, word, UnresolvedTokenTypes.Expression_Character)
+                    word = ''
+                    is_expr_char = False
+
+                # begin of is_string1
+                elif c == '\'' and not is_string1:
+                    is_string1 = True
+                    word = ''
+
+                # end of is_string1
+                elif c == '\'' and is_string1:
+                    is_string1 = False
+                    self._add_tok(tokens, word, UnresolvedTokenTypes.String)
+                    word = ''
+
+                # begin of is_string2
+                elif c == '\"' and not is_string2:
+                    is_string2 = True
+                    word = ''
+
+                # end of is_string2
+                elif c == '\"' and is_string2:
+                    is_string2 = False
+                    self._add_tok(tokens, word, UnresolvedTokenTypes.String)
+                    word = ''
+
+                # format char
+                elif c in LanguageDefinition.format_chars:
+                    self._add_tok(tokens, word, UnresolvedTokenTypes.Word)
+                    self._add_tok(tokens, c, UnresolvedTokenTypes.Format_Character)
+                    word = ''
+
+                # begin of number
+                elif c.isdigit() and not is_number and word == '':
+                    word += c
+                    is_number = True
+
+                # continue number
+                elif (c.isdigit() or c == '.') and is_number:
+                    word += c
+
+                # begin expression char
+                elif c in LanguageDefinition.expr_chars and not is_expr_char:
+                    word += c
+                    is_expr_char = True
+
+                # continue expression char
+                elif c in LanguageDefinition.expr_chars and is_expr_char:
+                    word += c
+
+                # bool expression char
+                elif c in LanguageDefinition.bool_expr_chars:
+                    self._add_tok(tokens, word, UnresolvedTokenTypes.Word)
+                    self._add_tok(tokens, c, UnresolvedTokenTypes.Bool_Expression_Character)
+                    word = ''
+
+                # end of word: a space outside of strings, or any newline
+                elif (c == ' ' and not is_string1 and not is_string2) or c == '\n':
+                    self._add_tok(tokens, word, UnresolvedTokenTypes.Word)
+                    word = ''
+
+                else:
+                    word += c
+
+            # a one-line comment ends with the line
+            if c == '\n' and ol_comment:
+                ol_comment = False
+
+            # end of multi-line comment
+            if prev == '*' and c == '/':
+                self._is_ml_comment = False
+
+            prev = c
+
+        return tokens