DSM/tokenizer.py

import re


class TokenizerError(Exception):
    """Raised for errors encountered while tokenizing."""


class Tokenizer:
    def __init__(self):
        # Reserved words of the language; matched case-insensitively below.
        self.reserved = [
            'READ', 'WRITE', 'IF', 'THEN', 'ELSEIF', 'ELSE', 'ENDIF', 'FOR',
            'IN', 'DO', 'ENDDO', 'NOW', 'CURTIME', 'MINIMUM', 'MAXIMUM',
            'FIRST', 'LAST', 'SUM', 'AVERAGE', 'EARLIEST', 'LATEST', 'COS',
            'SIN', 'TIME', 'OF', 'TRACE', 'TRUE', 'FALSE', 'IS', 'LIST',
            'NUMBER', 'NOT', 'COUNT', 'WHERE', 'IT', 'THEY', 'AND', 'OR',
            'LESS', 'GREATER', 'THAN', 'TO', 'WITHIN', 'EQUAL', 'OCCURS',
            'OCCURRED', 'AFTER', 'BEFORE'
        ]
        # Order matters: the alternation built below tries patterns in
        # insertion order, so longer operators must precede their prefixes
        # ('**' before '*'; '<>', '<=' before '<'; '>=' before '>').
        self.regex_patterns = {
            "STRTOKEN": r'(".*?")',
            "TIMETOKEN": r'\d{4}-\d{2}-\d{2}(T\d{2}:\d{2}:\d{2})?',
            "COMMENT": r'//.*$',
            "SEMICOLON": r';',
            "COMMA": r',',
            "ASSIGN": r':=',
            "PLUS": r'\+',
            "MINUS": r'-',
            "POWER": r'\*\*',
            "TIMES": r'\*',
            "DIVIDE": r'/',
            "LPAR": r'\(',
            "RPAR": r'\)',
            "LSPAR": r'\[',
            "RSPAR": r'\]',
            "AMPERSAND": r'&',
            "NEQ": r'<>',
            "LTEQ": r'<=',
            "GTEQ": r'>=',
            "LT": r'<',
            "GT": r'>',
            "EQ": r'=',
            "RANGE": r'\.\.\.'
        }
        # NULL and the reserved words must precede the catch-all NUMTOKEN
        # and IDENTIFIER patterns, or keywords would be swallowed by
        # IDENTIFIER. (Dict merging with |= requires Python 3.9+.)
        self.regex_patterns |= {
            "NULLTOKEN": r'\bNULL\b'
        }
        self.regex_patterns |= {word: rf'\b{word}\b' for word in self.reserved}
        self.regex_patterns |= {
            # Require digits after the decimal point so '1...10' lexes as
            # NUMTOKEN, RANGE, NUMTOKEN rather than consuming '1.' and
            # silently dropping the remaining '..'.
            "NUMTOKEN": r'\d+(\.\d+)?',
            "IDENTIFIER": r'\w+'
        }
        self.combined_pattern = '|'.join(
            f'(?P<{name}>{pattern})'
            for name, pattern in self.regex_patterns.items()
        )
        self.regex = re.compile(self.combined_pattern, re.IGNORECASE)
    def tokenize(self, input_file):
        """Tokenize input_file, returning [line_number, type, value] triples."""
        tokens = []
        with open(input_file, 'r') as file:
            for line_number, line in enumerate(file, start=1):
                for match in self.regex.finditer(line):
                    token_type = match.lastgroup
                    token_value = match.group(token_type)
                    if token_type == "STRTOKEN":
                        # Strip the surrounding quotes from string literals.
                        token_value = token_value[1:-1]
                    tokens.append([str(line_number), token_type, token_value])
        return tokens
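

# A minimal usage sketch (an assumption, not part of the original module):
# tokenize() reads from a file path, so this writes a small sample script
# to a temporary file (the '.dsm' extension is assumed) and prints the
# resulting tokens.
if __name__ == "__main__":
    import os
    import tempfile

    sample = (
        'x := 1 ... 10; // a range\n'
        'IF x <= 5 THEN WRITE "small"; ENDIF\n'
    )
    with tempfile.NamedTemporaryFile('w', suffix='.dsm', delete=False) as f:
        f.write(sample)
        path = f.name
    try:
        for line_number, token_type, token_value in Tokenizer().tokenize(path):
            print(line_number, token_type, token_value)
    finally:
        os.remove(path)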