"""Validate, filter and de-duplicate extracted KPI entities.

Entities are dictionaries with the keys ``"label"``, ``"entity"`` and
``"status"``.  They are grouped by label and then passed through three
filters: placeholder removal, numeric validation (driven by the KPI settings)
and duplicate removal.
"""

import os
import re
from typing import Any, Dict, List, Optional

try:
    import requests
except ImportError:
    # The filtering helpers below are pure Python; only the settings lookup
    # needs the HTTP client, and that lookup already degrades to "no settings".
    requests = None

COORDINATOR_URL = os.getenv("COORDINATOR_URL", "http://localhost:5000")

# Upper bound for the settings request so that an unresponsive coordinator
# cannot block validation indefinitely.
REQUEST_TIMEOUT_SECONDS = 10

# Labels whose entities are always returned, whatever their status.
PASSTHROUGH_LABELS = frozenset({"FONDSNAME", "DATUM", "FONDSMANAGER"})

# Normalized placeholder values that are removed from every label.
UNKNOWN_VALUES = frozenset({"nichtangegeben", "n/a"})

# Characters allowed in a numeric value.
_NUMBER_PATTERN = re.compile(r"^[0-9\-\s%,.€]+$")
# Two digit groups separated by a single whitespace character, e.g. "8 8".
_SPLIT_DIGITS_PATTERN = re.compile(r"\d+\s\d+")


def _normalize(value: Any) -> str:
    """Return *value* as a lower-cased string with all spaces removed."""
    return str(value).lower().replace(" ", "")


def _fetch_settings() -> List[Dict[str, Any]]:
    """Fetch the KPI settings from the coordinator service.

    Returns:
        The decoded JSON list, or an empty list when the request fails, the
        status code is not 200, or the payload is not a JSON list.
    """
    if requests is None:
        print("Error fetching settings: the 'requests' package is not installed")
        return []

    # Tolerate a configured base URL that ends with a slash.
    url = COORDINATOR_URL.rstrip("/") + "/api/kpi_setting/"
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        if response.status_code != 200:
            print(f"Error fetching settings: unexpected status {response.status_code}")
            return []
        settings = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an undecodable body on older requests versions.
        print(f"Error fetching settings: {e}")
        return []

    return settings if isinstance(settings, list) else []


def validate_entities(entities, settings: Optional[List[Dict[str, Any]]] = None):
    """Filter a flat list of entities and return the surviving ones.

    Args:
        entities: Iterable of dicts with the keys ``"label"``, ``"entity"``
            and ``"status"``.
        settings: Optional KPI settings, a list of dicts with the keys
            ``"name"`` and ``"type"``, for example
            ``[{"name": "Rendite", "type": "number"}]``.  When ``None`` the
            settings are fetched from ``COORDINATOR_URL``.

    Returns:
        A flat list of entities, ordered by first appearance of their label.
        For labels outside ``PASSTHROUGH_LABELS``, only entities with status
        ``"validated"`` are kept whenever at least one such entity exists;
        otherwise all remaining entities of that label are kept.
    """
    if settings is None:
        settings = _fetch_settings()

    # Group by label, e.g.
    # {"PERSON": [{"label": "PERSON", "entity": "John Doe", "status": "validated"}]}
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    for item in entities:
        grouped.setdefault(item["label"], []).append(item)

    grouped = delete_exxeta_unknown(grouped)
    grouped = validate_number(grouped, settings)
    grouped = delete_duplicate_entities(grouped)

    result = []
    for label, items in grouped.items():
        if label not in PASSTHROUGH_LABELS:
            validated = [e for e in items if e.get("status") == "validated"]
            if validated:
                items = validated
        result.extend(items)
    return result


def validate_number(entity_list, settings):
    """Drop non-numeric entities from labels configured as type ``"number"``.

    Args:
        entity_list: Mapping of label to the list of entities for that label.
        settings: List of setting dicts.  A label is treated as numeric when
            the first setting whose upper-cased ``"name"`` equals the label
            has ``"type" == "number"``.  Entries that are not dicts or have
            no string ``"name"`` are ignored.

    Returns:
        A new mapping.  Non-numeric labels are copied unchanged; numeric
        labels keep only valid numbers and are omitted when none remain.
    """
    # Build the lookup once; setdefault keeps the first setting per name.
    type_by_label: Dict[str, Any] = {}
    for setting in settings or ():
        if isinstance(setting, dict) and isinstance(setting.get("name"), str):
            type_by_label.setdefault(setting["name"].upper(), setting.get("type"))

    filtered_kpi = {}
    for label, items in entity_list.items():
        if type_by_label.get(label) != "number":
            filtered_kpi[label] = items
            continue

        numeric = []
        for entity in items:
            if is_valid_number(entity["entity"]):
                numeric.append(entity)
            else:
                print(f"Invalid number: {entity}")
        if numeric:  # Only add the label if there are entities left
            filtered_kpi[label] = numeric
    return filtered_kpi


def is_valid_number(number):
    """Return ``True`` when *number* looks like a numeric KPI value.

    The value is converted with ``str()`` first.  It must contain at least one
    digit, consist only of digits, ``-``, whitespace, ``%``, ``,``, ``.`` and
    ``€``, and must not contain digit groups separated by a single whitespace
    character (``"8 8"`` is rejected, ``"3 , 5"`` is accepted).
    """
    text = str(number)
    return (
        any(char.isdigit() for char in text)
        and _SPLIT_DIGITS_PATTERN.search(text) is None
        and _NUMBER_PATTERN.fullmatch(text) is not None
    )


def delete_exxeta_unknown(entity_list):
    """Remove placeholder entities such as "nicht angegeben" or "N/A".

    Values are compared case-insensitively with all spaces removed.

    Args:
        entity_list: Mapping of label to the list of entities for that label.

    Returns:
        A new mapping without placeholder entities; labels left without any
        entity are omitted.
    """
    filtered_kpi = {}
    for label, items in entity_list.items():
        kept = []
        for entity in items:
            if _normalize(entity["entity"]) in UNKNOWN_VALUES:
                print(f"filtered out: {entity}")
            else:
                kept.append(entity)
        if kept:  # Only add the label if there are entities left
            filtered_kpi[label] = kept
    return filtered_kpi


def delete_duplicate_entities(entity_list):
    """Keep only the first entity per normalized value within each label.

    Two entities count as duplicates when their values are equal after
    lower-casing and removing all spaces.

    Args:
        entity_list: Mapping of label to the list of entities for that label.

    Returns:
        A new mapping with duplicates removed; labels with an empty entity
        list are omitted.
    """
    unique_entities = {}
    for label, items in entity_list.items():
        seen = set()
        kept = []
        for entity in items:
            key = _normalize(entity["entity"])
            if key in seen:
                print(f"Duplicate entity: {entity}")
            else:
                seen.add(key)
                kept.append(entity)
        if kept:
            unique_entities[label] = kept
    return unique_entities


def main():
    """Run the validation on a small set of sample entities and print the result."""
    entities = [
        # {"label": "PERSON", "entity": "John Doe", "status": "validated"},
        # {"label": "PERSON", "entity": "Exxeta", "status": "invalid"},
        # {"label": "ORG", "entity": "Google", "status": "invalid"},
        # {"label": "FONDSNAME", "entity": "Microsoft", "status": "validated"},
        # {"label": "FONDSNAME", "entity": "Amazon", "status": "invalid"},
        # {"label": "FONDSNAME", "entity": "Apple", "status": "invalid"},
        {"label": "RENDITE", "entity": "8 8 8 8 8", "status": "validated"},
        {"label": "RENDITE", "entity": "N/A", "status": "validated"},
        {"label": "RENDITE", "entity": "nicht angegeben", "status": "validated"},
        {"label": "RENDITE", "entity": "uaieluae--t>", "status": "validated"},
        {"label": "RENDITE", "entity": "3,5", "status": "validated"},
        {"label": "RENDITE", "entity": "3,5", "status": "validated"},
        {"label": "RENDITE", "entity": "3 , 5", "status": "validated"},
        {"label": "RENDITE", "entity": "3%", "status": "validated"},
        {"label": "RENDITE", "entity": "", "status": "invalid"},
        {"label": "RENDITE", "entity": "2 mehr als 6", "status": "invalid"},
        {"label": "RENDITE", "entity": 2, "status": "invalid"},
    ]
    print(validate_entities(entities))


if __name__ == "__main__":
    main()