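"""Regex-based tokenizer: each token type is a named group in one combined
alternation, tried in declaration order; tokenize() emits
[line, type, value] triples."""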
import re


class TokenizerError(Exception):
    """Raised when the input contains a character no token pattern matches."""
class Tokenizer:
    def __init__(self):
        # Reserved words of the target language, matched case-insensitively
        # as whole words.
        self.reserved = [
            'READ', 'WRITE', 'IF', 'THEN', 'ELSEIF', 'ELSE', 'ENDIF', 'FOR',
            'IN', 'DO', 'ENDDO', 'NOW', 'CURTIME', 'MINIMUM', 'MAXIMUM',
            'FIRST', 'LAST', 'SUM', 'AVERAGE', 'EARLIEST', 'LATEST', 'COS',
            'SIN', 'TIME', 'OF', 'TRACE', 'TRUE', 'FALSE', 'IS', 'LIST',
            'NUMBER', 'NOT', 'COUNT', 'WHERE', 'IT', 'THEY', 'AND', 'OR',
            'LESS', 'GREATER', 'THAN', 'TO', 'WITHIN', 'EQUAL', 'OCCURS',
            'OCCURRED', 'AFTER', 'BEFORE',
        ]
        # Token patterns. Order matters: the combined regex tries these
        # alternatives left to right, so multi-character operators (':=',
        # '**', '<=', '>=', '<>') must appear before their single-character
        # prefixes.
        self.regex_patterns = {
            "STRTOKEN": r'".*?"',
            "TIMETOKEN": r'\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2})?',
            "COMMENT": r'//.*$',
            "SEMICOLON": r';',
            "COMMA": r',',
            "ASSIGN": r':=',
            "PLUS": r'\+',
            "MINUS": r'-',
            "POWER": r'\*\*',
            "TIMES": r'\*',
            "DIVIDE": r'/',
            "LPAR": r'\(',
            "RPAR": r'\)',
            "LSPAR": r'\[',
            "RSPAR": r'\]',
            "AMPERSAND": r'&',
            "NEQ": r'<>',
            "LTEQ": r'<=',
            "GTEQ": r'>=',
            "LT": r'<',
            "GT": r'>',
            "EQ": r'=',
            "RANGE": r'\.\.\.',
        }
        # Whole-word (\b...\b) patterns for NULL and the reserved words, so
        # that e.g. 'IFFY' tokenizes as an IDENTIFIER rather than as IF.
        self.regex_patterns |= {"NULLTOKEN": r'\bNULL\b'}
        self.regex_patterns |= {word: rf'\b{word}\b' for word in self.reserved}
        self.regex_patterns |= {
            # Require a digit after the decimal point so that '1...5' lexes
            # as NUMTOKEN RANGE NUMTOKEN rather than as '1.' '..' '5'.
            "NUMTOKEN": r'\d+(?:\.\d+)?',
            # IDENTIFIER is the most general pattern and must come last so it
            # cannot shadow the keywords above.
            "IDENTIFIER": r'\w+',
            # Catch-all: any other non-whitespace character is a lexing error.
            "MISMATCH": r'\S',
        }
        # Combine everything into a single alternation of named groups; dict
        # insertion order (guaranteed since Python 3.7; the |= merge operator
        # needs 3.9+) determines match priority.
        self.combined_pattern = '|'.join(
            f'(?P<{name}>{pattern})'
            for name, pattern in self.regex_patterns.items()
        )
        self.regex = re.compile(self.combined_pattern, re.IGNORECASE)

    def tokenize(self, input_file):
        """Tokenize input_file, returning [line, type, value] triples."""
        tokens = []
        with open(input_file, 'r') as file:
            for i, line in enumerate(file):
                for match in self.regex.finditer(line):
                    token_type = match.lastgroup
                    token_value = match.group(token_type)
                    if token_type == "MISMATCH":
                        raise TokenizerError(
                            f"unexpected character {token_value!r} on line {i + 1}"
                        )
                    if token_type == "STRTOKEN":
                        # Strip the surrounding double quotes from string literals.
                        token_value = token_value.replace('"', '')
                    tokens.append([str(i + 1), token_type, token_value])
        return tokens
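

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: write a small
    # illustrative program to a temporary file and tokenize it. The sample
    # source below is an assumption about the target language's syntax.
    import os
    import tempfile

    sample = (
        'x := (a + b) ** 2;  // assignment with a comment\n'
        'IF x >= 10 THEN WRITE "big" ENDIF;\n'
    )
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
        f.write(sample)
        path = f.name
    try:
        for line_no, token_type, token_value in Tokenizer().tokenize(path):
            print(line_no, token_type, token_value)
    finally:
        os.unlink(path)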