def normalize_entity(entity_str):
    """Lowercase an entity string and strip all whitespace so surface-form
    variants compare equal."""
    if not entity_str:
        return ""
    normalized = entity_str.replace("\n", " ")
    normalized = "".join(normalized.lower().split())
    return normalized


def merge_entities(spacy_data, exxeta_data):
    """Merge two entity lists: entities found by both sources (same label,
    normalized text, and page) are marked "validated"; all others are kept
    as "single-source" with their origin."""
    merged = []
    seen = set()

    # Process SpaCy entities first
    for s in spacy_data:
        s_entity_norm = normalize_entity(s["entity"])
        s_page = s["page"]

        # Look for a matching Exxeta entity
        found = False
        for e in exxeta_data:
            e_entity_norm = normalize_entity(e["entity"])
            e_page = e["page"]

            # Match if label, normalized entity text, and page all agree
            if (
                s["label"] == e["label"]
                and s_entity_norm == e_entity_norm
                and s_page == e_page
            ):
                merged.append(
                    {
                        "label": s["label"],
                        "entity": s["entity"],
                        "page": s_page,
                        "status": "validated",
                    }
                )
                seen.add((e["entity"], e_page))
                found = True
                break

        # If no match was found, keep the SpaCy entity as single-source
        if not found:
            merged.append(
                {
                    "label": s["label"],
                    "entity": s["entity"],
                    "page": s_page,
                    "status": "single-source",
                    "source": "spacy",
                }
            )

    # Add remaining Exxeta entities that were not matched above
    for e in exxeta_data:
        if (e["entity"], e["page"]) not in seen:
            merged.append(
                {
                    "label": e["label"],
                    "entity": e["entity"],
                    "page": e["page"],
                    "status": "single-source",
                    "source": "exxeta",
                }
            )

    return merged
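
# Minimal usage sketch (hypothetical sample data; the real inputs come from
# the SpaCy and Exxeta extraction steps and use the same dict keys as above).
if __name__ == "__main__":
    spacy_results = [
        {"label": "ORG", "entity": "ACME Corp", "page": 3},
        {"label": "DATE", "entity": "2021-06-30", "page": 5},
    ]
    exxeta_results = [
        {"label": "ORG", "entity": "acme corp", "page": 3},
    ]

    for row in merge_entities(spacy_results, exxeta_results):
        print(row)
    # Expected: the ORG entity is "validated" (both sources agree after
    # normalization), while the DATE entity is "single-source" from spacy.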