import re


class TokenizerError(Exception):
    pass


class Tokenizer:
    def __init__(self):
        self.reserved = [
            'READ', 'WRITE', 'IF', 'THEN', 'ELSEIF', 'ELSE', 'ENDIF', 'FOR',
            'IN', 'DO', 'ENDDO', 'NOW', 'CURTIME', 'MINIMUM', 'MAXIMUM',
            'FIRST', 'LAST', 'SUM', 'AVERAGE', 'EARLIEST', 'LATEST', 'COS',
            'SIN', 'TIME', 'OF', 'TRACE', 'TRUE', 'FALSE', 'IS', 'LIST',
            'NUMBER', 'NOT', 'COUNT', 'WHERE', 'IT', 'THEY', 'AND', 'OR',
            'LESS', 'GREATER', 'THAN', 'TO', 'WITHIN', 'EQUAL', 'OCCURS',
            'OCCURRED', 'AFTER', 'BEFORE'
        ]

        # Literal and operator patterns. Order matters: the combined regex
        # tries alternatives left to right, so multi-character operators
        # ('**', '<>', '<=', '>=', ':=') must precede their single-character
        # prefixes.
        self.regex_patterns = {
            "STRTOKEN": r'(".*?")',
            "TIMETOKEN": r'\d{4}-\d{2}-\d{2}(T\d{2}:\d{2}:\d{2})?',
            "COMMENT": r'//.*$',
            "SEMICOLON": r';',
            "COMMA": r',',
            "ASSIGN": r':=',
            "PLUS": r'\+',
            "MINUS": r'-',
            "POWER": r'\*\*',
            "TIMES": r'\*',
            "DIVIDE": r'/',
            "LPAR": r'\(',
            "RPAR": r'\)',
            "LSPAR": r'\[',
            "RSPAR": r'\]',
            "AMBERSAND": r'&',
            "NEQ": r'<>',
            "LTEQ": r'<=',
            "GTEQ": r'>=',
            "LT": r'<',
            "GT": r'>',
            "EQ": r'=',
            "RANGE": r'\.\.\.'
        }
        self.regex_patterns |= {"NULLTOKEN": r'\bNULL\b'}
        # Reserved words must come before IDENTIFIER so they are not
        # swallowed by the generic \w+ pattern.
        self.regex_patterns |= {word: rf'\b{word}\b' for word in self.reserved}
        self.regex_patterns |= {
            "NUMTOKEN": r'\d+(\.\d*)?',
            "IDENTIFIER": r'\w+'
        }

        # Combine all patterns into one regex of named groups; the name of
        # the group that matched identifies the token type.
        self.combined_pattern = '|'.join(
            f'(?P<{name}>{pattern})'
            for name, pattern in self.regex_patterns.items()
        )
        self.regex = re.compile(self.combined_pattern, re.IGNORECASE)

    def tokenize(self, input_file):
        """Scan input_file line by line and return a list of
        [line_number, token_type, token_value] triples."""
        tokens = []
        with open(input_file, 'r') as file:
            for i, line in enumerate(file):
                for match in self.regex.finditer(line):
                    token_type = match.lastgroup
                    token_value = match.group(token_type)
                    if token_type == "STRTOKEN":
                        # Strip the surrounding quotes from string literals.
                        token_value = token_value.replace('"', '')
                    tokens.append([str(i + 1), token_type, token_value])
        return tokens
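

if __name__ == "__main__":
    # Minimal usage sketch. The input path below is a hypothetical example,
    # not a file shipped with this module; pass a real source file on the
    # command line. Each token is printed as
    # [line_number, token_type, token_value].
    import sys

    source_path = sys.argv[1] if len(sys.argv) > 1 else "example.mlm"
    tokenizer = Tokenizer()
    try:
        for token in tokenizer.tokenize(source_path):
            print(token)
    except FileNotFoundError:
        print(f"Input file not found: {source_path}")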