68 lines
1.9 KiB
Python
68 lines
1.9 KiB
Python
def normalize_entity(entity_str):
    """Normalize entity text for cross-source comparison.

    Lowercases the text and strips ALL whitespace (spaces, tabs,
    newlines), so e.g. "John\\nSmith" and "john smith" both normalize
    to "johnsmith".

    Args:
        entity_str: Raw entity text; may be None or empty.

    Returns:
        The normalized string, or "" for falsy input.
    """
    if not entity_str:
        return ""
    # str.split() with no arguments splits on any run of whitespace,
    # including newlines, so no separate "\n" -> " " replace is needed.
    return "".join(entity_str.lower().split())
|
|
|
|
|
|
def merge_entities(spacy_data, exxeta_data):
    """Merge SpaCy and Exxeta entity lists into a single annotated list.

    An entity present in both sources — same "label", same "page", and
    the same normalized text per ``normalize_entity`` — is emitted once
    (with the SpaCy surface form) and ``status="validated"``. Entities
    found in only one source are emitted with ``status="single-source"``
    and a ``source`` tag ("spacy" or "exxeta").

    Args:
        spacy_data: List of dicts with keys "label", "entity", "page".
        exxeta_data: List of dicts with the same keys.

    Returns:
        List of merged entity dicts.
    """
    # Index Exxeta entities by (label, normalized text, page) so each
    # SpaCy entity is matched in O(1) instead of rescanning the whole
    # Exxeta list (the original was O(n*m) and re-normalized every pair).
    # setdefault keeps the FIRST entry per key, preserving the original
    # first-match semantics.
    exxeta_index = {}
    for e in exxeta_data:
        key = (e["label"], normalize_entity(e["entity"]), e["page"])
        exxeta_index.setdefault(key, e)

    merged = []
    # Raw (entity, page) pairs of Exxeta entries already consumed by a
    # match, so they are not re-emitted in the final pass.
    seen = set()

    # Process SpaCy entities first.
    for s in spacy_data:
        key = (s["label"], normalize_entity(s["entity"]), s["page"])
        match = exxeta_index.get(key)
        if match is not None:
            # Confirmed by both sources.
            merged.append(
                {
                    "label": s["label"],
                    "entity": s["entity"],
                    "page": s["page"],
                    "status": "validated",
                }
            )
            seen.add((match["entity"], match["page"]))
        else:
            # No Exxeta counterpart: SpaCy-only entity.
            merged.append(
                {
                    "label": s["label"],
                    "entity": s["entity"],
                    "page": s["page"],
                    "status": "single-source",
                    "source": "spacy",
                }
            )

    # Add remaining Exxeta entities never matched by any SpaCy entity.
    for e in exxeta_data:
        if (e["entity"], e["page"]) not in seen:
            merged.append(
                {
                    "label": e["label"],
                    "entity": e["entity"],
                    "page": e["page"],
                    "status": "single-source",
                    "source": "exxeta",
                }
            )

    return merged
|