Added lexer

This commit is contained in:
2021-08-14 16:41:25 +02:00
parent 4a3f9b4b9d
commit c909ce31d8
18 changed files with 438 additions and 210 deletions

View File

@@ -1,7 +1,14 @@
from abc import ABC, abstractmethod
from cpl_query.extension.list import List
from lexer.model.token import Token
class LexerABC(ABC):
    """Interface for lexers that split raw source lines into tokens."""

    @abstractmethod
    def __init__(self): pass

    @abstractmethod
    def tokenize(self, line: str) -> List[Token]: pass

View File

20
src/lexer/model/token.py Normal file
View File

@@ -0,0 +1,20 @@
from lexer.model.token_types import TokenTypes
class Token:
    """A single lexical token: a resolved token type paired with its raw text."""

    def __init__(self, token_type: TokenTypes, value: str) -> None:
        """
        :param token_type: resolved category of the token
        :param value: raw text the token was built from
        """
        self._type: TokenTypes = token_type
        self._value: str = value

    @property
    def type(self) -> TokenTypes:
        # read-only: the type is fixed at construction time
        return self._type

    @property
    def value(self) -> str:
        return self._value

    @value.setter
    def value(self, value: str):
        self._value = value

    def __repr__(self) -> str:
        # added for debuggability of lexer output; purely additive, no caller impact
        return f'{type(self).__name__}({self._type}, {self._value!r})'

View File

@@ -0,0 +1,24 @@
from enum import Enum
class TokenTypes(Enum):
    """Final token categories produced after resolving scanned fragments."""
    Empty = 0                        # no meaningful token
    Keyword = 1                      # language keyword
    Type = 2                         # built-in data type name
    Name = 3                         # identifier
    Bool = 4                         # boolean literal
    String = 5                       # quoted string literal
    Number = 6                       # numeric literal
    Expression_Character = 7         # arithmetic operator character(s)
    Bool_Expression_Character = 8    # comparison/boolean operator character
    Format_Character = 9             # structural punctuation (braces, commas, ...)
class UnresolvedTokenTypes(Enum):
    """Coarse categories assigned while scanning, before resolution.

    A Word is later refined into Keyword/Type/Bool/Name; the remaining
    members map 1:1 onto their TokenTypes counterpart.
    """
    Empty = 0
    Word = 1
    Number = 2
    String = 3
    Expression_Character = 4
    Bool_Expression_Character = 5
    Format_Character = 6

View File

@@ -0,0 +1,62 @@
from enum import Enum
class Keywords(Enum):
    """Reserved words of the language; each value is the source-level spelling."""
    # structure-defining keywords
    Library = 'lib'
    Class = 'class'
    Function = 'func'
    Variable = 'var'
    Use = 'use'
    From = 'from'
    # built-in functions
    Output = 'output'
    Input = 'input'
    Length = 'length'
    Range = 'range'
    Exit = 'exit'
    # control-flow keywords
    If = 'if'
    ElseIf = 'elseif'
    Else = 'else'
    Continue = 'continue'
    In = 'in'
    Return = 'return'
    # loops
    While = 'while'
    For = 'for'
    # access modifiers
    Public = 'public'
    This = 'this'
class Booleans(Enum):
    """Boolean literals; member names avoid clashing with Python's True/False."""
    Right = 'true'
    Wrong = 'false'
class ExpressionCharacters(Enum):
    """Characters that form arithmetic/assignment expressions."""
    Plus = '+'
    Minus = '-'
    Asterisk = '*'
    Slash = '/'
    Equal = '='
    Caret = '^'
class FormatCharacters(Enum):
    """Structural punctuation characters recognised by the lexer."""
    Left_Brace = '{'
    Right_Brace = '}'
    Left_Parenthesis = '('
    Right_Parenthesis = ')'
    Left_Bracket = '['
    Right_Bracket = ']'
    Semicolon = ';'
    Colon = ':'
    Comma = ','
    Point = '.'

View File

@@ -1,7 +1,166 @@
from cpl_query.extension.list import List
from cc_lang.model.language_definition import LanguageDefinition
from lexer.abc.lexer_abc import LexerABC
from lexer.model.token import Token
from lexer.model.token_types import UnresolvedTokenTypes, TokenTypes
from runtime.abc.runtime_service_abc import RuntimeServiceABC
class LexerService(LexerABC):
    """Lexer turning raw source lines into typed Token lists.

    Multi-line comment state (``/* ... */``) is kept on the instance so it
    carries across successive tokenize() calls.
    """

    def __init__(self, runtime: RuntimeServiceABC):
        """
        :param runtime: runtime service the lexer is bound to

        BUGFIX: the original declared a parameterless ``__init__`` immediately
        before this one; Python only keeps the last definition, so the dead
        duplicate has been removed.
        """
        self._runtime = runtime
        self._is_ml_comment = False

    def _add_tok(self, tokens: List[Token], value: str, input_token_type: UnresolvedTokenTypes) -> None:
        """Resolve *input_token_type* for *value* and append a Token to *tokens*.

        Empty values are ignored. Words are classified against the language
        definition (keyword / data type / bool literal / plain name); every
        other unresolved type maps 1:1 to its resolved counterpart.

        :param tokens: output list the new token is appended to
        :param value: raw token text collected by tokenize()
        :param input_token_type: coarse category determined while scanning
        """
        if value == '':
            return  # nothing collected -> nothing to append

        token_type: TokenTypes = TokenTypes.Empty
        if input_token_type == UnresolvedTokenTypes.Word:
            if value in LanguageDefinition.keywords:
                token_type = TokenTypes.Keyword
            elif value in LanguageDefinition.datatypes:
                token_type = TokenTypes.Type
            elif value in LanguageDefinition.bool_values:
                token_type = TokenTypes.Bool
            else:
                # BUGFIX: the original had an extra branch comparing the *string*
                # value against the enum member UnresolvedTokenTypes.Empty —
                # always False, therefore dead code; removed.
                token_type = TokenTypes.Name
        elif input_token_type == UnresolvedTokenTypes.Number:
            token_type = TokenTypes.Number
        elif input_token_type == UnresolvedTokenTypes.String:
            token_type = TokenTypes.String
        elif input_token_type == UnresolvedTokenTypes.Expression_Character:
            token_type = TokenTypes.Expression_Character
        elif input_token_type == UnresolvedTokenTypes.Bool_Expression_Character:
            token_type = TokenTypes.Bool_Expression_Character
        elif input_token_type == UnresolvedTokenTypes.Format_Character:
            token_type = TokenTypes.Format_Character

        tokens.append(Token(token_type, value))

    def tokenize(self, line: str) -> List[Token]:
        """Split *line* into Tokens.

        Handles '#' and '//' single-line comments, '/* ... */' multi-line
        comments (state survives across calls), single- and double-quoted
        strings, numbers, expression / bool-expression / format characters
        and plain words.

        :param line: raw source line (may contain a trailing newline)
        :return: List of resolved tokens
        """
        tokens: List[Token] = List(Token)
        word = ''
        ol_comment = False      # inside a single-line comment
        is_string1 = False      # inside 'single quoted' string
        is_string2 = False      # inside "double quoted" string
        is_number = False       # currently collecting a number literal
        is_expr_char = False    # currently collecting expression characters

        for i in range(len(line)):
            c = line[i]
            # BUGFIX: the original read line[i - 1] unguarded, which wraps to
            # the *last* character of the line when i == 0 and could falsely
            # trigger comment detection on lines ending in '/' or '*'.
            prev = line[i - 1] if i > 0 else ''

            # ignore comment bodies entirely
            if not ol_comment and not self._is_ml_comment:
                # comment filtering
                if c == '#' and not is_string1 and not is_string2:
                    ol_comment = True
                elif prev == '/' and c == '/':
                    ol_comment = True
                elif prev == '/' and c == '*':
                    # BUGFIX: the original executed `i += 2` here — a no-op,
                    # because `for i in range(...)` discards reassignments of i.
                    # The comment body is skipped via the _is_ml_comment flag.
                    self._is_ml_comment = True
                # end of number: flush it, then re-lex the terminating char
                elif not c.isdigit() and c != '.' and is_number:
                    self._add_tok(tokens, word, UnresolvedTokenTypes.Number)
                    for local_token in self.tokenize(c):
                        tokens.append(local_token)
                    word = ''
                    is_number = False
                # end of an expression-character run
                elif c not in LanguageDefinition.expr_chars and is_expr_char:
                    self._add_tok(tokens, word, UnresolvedTokenTypes.Expression_Character)
                    word = ''
                    is_expr_char = False
                # begin of 'string'
                elif c == '\'' and not is_string1:
                    is_string1 = True
                    word = ''
                # end of 'string'
                elif c == '\'' and is_string1:
                    is_string1 = False
                    self._add_tok(tokens, word, UnresolvedTokenTypes.String)
                    word = ''
                # begin of "string"
                elif c == '\"' and not is_string2:
                    is_string2 = True
                    word = ''
                # end of "string"
                elif c == '\"' and is_string2:
                    is_string2 = False
                    self._add_tok(tokens, word, UnresolvedTokenTypes.String)
                    word = ''
                # format char terminates the current word and is its own token
                elif c in LanguageDefinition.format_chars:
                    self._add_tok(tokens, word, UnresolvedTokenTypes.Word)
                    self._add_tok(tokens, c, UnresolvedTokenTypes.Format_Character)
                    word = ''
                # begin of number (only when no word is in progress)
                elif c.isdigit() and not is_number and word == '':
                    word += c
                    is_number = True
                # continue number (allows '.' for decimals)
                elif (c.isdigit() or c == '.') and is_number:
                    word += c
                # begin expression-character run
                elif c in LanguageDefinition.expr_chars and not is_expr_char:
                    word += c
                    is_expr_char = True
                # continue expression-character run
                elif c in LanguageDefinition.expr_chars and is_expr_char:
                    word += c
                # bool expression char terminates the word and is its own token
                elif c in LanguageDefinition.bool_expr_chars:
                    self._add_tok(tokens, word, UnresolvedTokenTypes.Word)
                    self._add_tok(tokens, c, UnresolvedTokenTypes.Bool_Expression_Character)
                    word = ''
                # space (outside strings) or newline ends the current word
                elif c == ' ' and not is_string1 and not is_string2 or c == '\n':
                    self._add_tok(tokens, word, UnresolvedTokenTypes.Word)
                    word = ''
                else:
                    word += c

            # newline terminates a single-line comment
            if c == '\n' and ol_comment:
                ol_comment = False
            # '*/' terminates a multi-line comment (checking the flag first is
            # equivalent to the original, which set it to False unconditionally)
            if self._is_ml_comment and prev == '*' and c == '/':
                self._is_ml_comment = False

        return tokens