Merge branch 'main' into #15-spacy-finetuning

pull/49/head
Zainab MohamedBasheer 2025-06-02 22:55:01 +02:00
commit 4fcfcb856e
15 changed files with 557 additions and 11 deletions

View File

@ -2,3 +2,4 @@ API_KEY=
DATABASE_URL=postgresql://admin:admin@db:5432 DATABASE_URL=postgresql://admin:admin@db:5432
POSTGRES_PASSWORD=admin POSTGRES_PASSWORD=admin
POSTGRES_USER=admin POSTGRES_USER=admin
COORDINATOR_URL="coordinator:5000"

View File

@ -18,4 +18,4 @@ COPY . .
ENV PYTHONUNBUFFERED=1 ENV PYTHONUNBUFFERED=1
EXPOSE 5000 EXPOSE 5000
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app"] CMD ["gunicorn","--timeout", "10","--workers", "2", "--bind", "0.0.0.0:5000", "app:app"]

View File

@ -42,7 +42,7 @@ def extract_with_exxeta(pages_json):
" },\n" " },\n"
" ...\n" " ...\n"
"]\n\n" "]\n\n"
"Nur JSON-Antwort keine Kommentare, keine Erklärungen.\n\n" "Nur JSON-Antwort - keine Kommentare, keine Erklärungen.\n\n"
f"TEXT:\n{text}" f"TEXT:\n{text}"
) )
else: else:
@ -68,7 +68,7 @@ def extract_with_exxeta(pages_json):
"- LÄNDERALLOKATION (z. B. DEUTSCHLAND, FRANKREICH, etc. inkl. %-Angaben)\n\n" "- LÄNDERALLOKATION (z. B. DEUTSCHLAND, FRANKREICH, etc. inkl. %-Angaben)\n\n"
"WICHTIG:\n" "WICHTIG:\n"
"- Gib **nur eine Entität pro Kennzahl** an keine Listen oder Interpretationen.\n" "- Gib **nur eine Entität pro Kennzahl** an - keine Listen oder Interpretationen.\n"
"- Wenn mehrere Varianten genannt werden (z. B. \"Core und Core+\"), gib sie im Originalformat als **eine entity** an.\n" "- Wenn mehrere Varianten genannt werden (z. B. \"Core und Core+\"), gib sie im Originalformat als **eine entity** an.\n"
"- **Keine Vermutungen oder Ergänzungen**. Wenn keine Information enthalten ist, gib die Kennzahl **nicht aus**.\n" "- **Keine Vermutungen oder Ergänzungen**. Wenn keine Information enthalten ist, gib die Kennzahl **nicht aus**.\n"
"- Extrahiere **nur wörtlich vorkommende Inhalte** (keine Berechnungen, keine Zusammenfassungen).\n" "- Extrahiere **nur wörtlich vorkommende Inhalte** (keine Berechnungen, keine Zusammenfassungen).\n"
@ -86,7 +86,7 @@ def extract_with_exxeta(pages_json):
"]\n\n" "]\n\n"
f"Falls keine Kennzahl enthalten ist, gib ein leeres Array [] zurück.\n\n" f"Falls keine Kennzahl enthalten ist, gib ein leeres Array [] zurück.\n\n"
f"Nur JSON-Antwort keine Kommentare, keine Erklärungen, kein Text außerhalb des JSON.\n\n" f"Nur JSON-Antwort - keine Kommentare, keine Erklärungen, kein Text außerhalb des JSON.\n\n"
f"TEXT:\n{text}" f"TEXT:\n{text}"
) )

View File

@ -0,0 +1 @@
COORDINATOR_URL=""

View File

@ -0,0 +1,97 @@
# Validate Service
Ein Flask-basierter Microservice zur asynchronen Verarbeitung und Validierung von Entitäten aus zwei verschiedenen NLP-Services (SpaCy und Exxeta).
## Funktionsweise
Das Service empfängt für jede eindeutige ID zwei POST-Requests von verschiedenen Services:
1. **SpaCy Service** - sendet extrahierte Entitäten
2. **Exxeta Service** - sendet extrahierte Entitäten
Beim ersten Request werden die Daten zwischengespeichert. Beim zweiten Request startet die asynchrone Verarbeitung. Nach der Verarbeitung werden die Ergebnisse an einen nachgelagerten Service weitergeleitet.
## API Endpoints
### POST /validate
Empfängt Entitätsdaten von SpaCy oder Exxeta Services.
**Request Body:**
```json
{
"id": "pitch_book_id",
"service": "spacy|exxeta",
"entities": [
{
"label": "PERSON",
"entity": "Max Mustermann",
"page": 1
}
]
}
```
**Response:**
- **200**: Daten erfolgreich verarbeitet
- **400**: Fehlende oder ungültige Parameter
- **500**: Serverfehler
## Installation und Start
1. **Abhängigkeiten installieren:**
```bash
pip install -r requirements.txt
```
2. **Service starten:**
```bash
python app.py
```
Das Service läuft standardmäßig auf `http://localhost:5050`
## Konfiguration
Umgebungsvariablen in `config.py`:
- `COORDINATOR_URL`: URL des Koordinators
## Verarbeitungslogik
1. **Zwischenspeicherung**: Beim ersten Request wird das JSON in einem Thread-sicheren Dictionary gespeichert
2. **Trigger**: Beim zweiten Request wird die asynchrone Verarbeitung gestartet
3. **Merge & Validate**: Die `merge_and_validate_entities` Funktion führt die Validierung durch:
- Normalisiert Entitäten (entfernt Zeilenumbrüche, konvertiert zu lowercase)
- Matched Entitäten basierend auf Label, normalisiertem Text und Seitenzahl
- Kennzeichnet Entitäten als "validated" (beide Services) oder "single-source"
4. **Weiterleitung**: Ergebnisse werden an den nächsten Service gesendet
5. **Cleanup**: Verarbeitete Daten werden aus dem Speicher entfernt
## Architektur
```
┌─────────────────┐ ┌─────────────────┐
│ SpaCy Service │ │ Exxeta Service │
└─────────┬───────┘ └─────────┬───────┘
│ │
│ POST /validate │ POST /validate
│ (service_type:spacy) │ (service_type:exxeta)
▼ ▼
┌─────────────────────────────────────┐
│ Validate Service │
│ ┌─────────────────────────────┐ │
│ │ Zwischenspeicher │ │
│ │ (Thread-safe Dictionary) │ │
│ └─────────────────────────────┘ │
│ ┌─────────────────────────────┐ │
│ │ Asynchrone Verarbeitung │ │
│ │ (merge_and_validate_entities)│ │
│ └─────────────────────────────┘ │
└─────────────┬───────────────────────┘
│ POST (processed data)
┌─────────────────────────────┐
│ Nachgelagerter Service │
└─────────────────────────────┘
```

View File

@ -0,0 +1,130 @@
from flask import Flask, request, jsonify
import threading
from merge_logic import merge_entities
from validate_logic import validate_entities
from dotenv import load_dotenv
import os
import requests
import json
app = Flask(__name__)
load_dotenv()
coordinator_url = os.getenv("COORDINATOR_URL") or ""
# todo add persistence layer
data_storage = {} # {id: {spacy_data: [], exxeta_data: []}}
storage_lock = threading.Lock()
def send_to_coordinator_service(processed_data, request_id):
if not coordinator_url:
print("Not processed, missing url", processed_data)
return
try:
payload = {
"kpi": json.dumps(processed_data),
}
requests.put(
"http://" + coordinator_url + "/api/pitch_book/" + str(request_id),
data=payload,
)
print(f"Result PitchBook {request_id} sent to coordinator")
except Exception as e:
print(f"Error sending ID {request_id}: {e}")
def process_data_async(request_id, spacy_data, exxeta_data):
try:
print(f"Start asynchronous processing for PitchBook: {request_id}")
# Perform merge
merged_entities = merge_entities(spacy_data, exxeta_data)
valid_entities = validate_entities(merged_entities)
# Send result to next service
send_to_coordinator_service(valid_entities, request_id)
# Remove processed data from storage
with storage_lock:
if request_id in data_storage:
del data_storage[request_id]
except Exception as e:
print(f"Error during asynchronous processing for ID {request_id}: {e}")
@app.route("/validate", methods=["POST"])
def validate():
try:
json_data = request.get_json()
if not json_data:
return jsonify({"error": "Missing JSON data"}), 400
# extract ID and service_type from the data
request_id = json_data.get("id")
service_type = json_data.get("service") # 'spacy' or 'exxeta'
entities = json_data.get("entities", [])
if not request_id or not service_type:
return jsonify({"error": "ID and service_type are required"}), 400
if service_type not in ["spacy", "exxeta"]:
return jsonify({"error": "service_type has to be 'spacy' or 'exxeta'"}), 400
with storage_lock:
# Initialize entry if not already present
if request_id not in data_storage:
data_storage[request_id] = {
"spacy_data": None,
"exxeta_data": None,
}
# Store the data based on the service type
data_storage[request_id][f"{service_type}_data"] = entities
# Check if both datasets are present
stored_data = data_storage[request_id]
spacy_data = stored_data["spacy_data"]
exxeta_data = stored_data["exxeta_data"]
# If both datasets are present, start asynchronous processing
if spacy_data is not None and exxeta_data is not None:
# Start asynchronous processing in a separate thread
processing_thread = threading.Thread(
target=process_data_async,
args=(request_id, spacy_data, exxeta_data),
daemon=True,
)
processing_thread.start()
return (
jsonify(
{
"message": f"Second dataset for ID {request_id} received. Processing started.",
}
),
200,
)
else:
return (
jsonify(
{
"message": f"First dataset for ID {request_id} from {service_type} stored. Waiting for second dataset.",
}
),
200,
)
except Exception as e:
print(f"Error occurred: {str(e)}")
return jsonify({"error": f"Fehler: {str(e)}"}), 500
if __name__ == "__main__":
app.run(debug=True, host="0.0.0.0", port=5050)

View File

@ -0,0 +1,68 @@
def normalize_entity(entity_str):
if not entity_str:
return ""
normalized = entity_str.replace("\n", " ")
normalized = "".join(normalized.lower().split())
return normalized
def merge_entities(spacy_data, exxeta_data):
merged = []
seen = set()
# Process SpaCy entities first
for s in spacy_data:
s_entity_norm = normalize_entity(s["entity"])
s_page = s["page"]
# Look for matching Exxeta entities
found = False
for e in exxeta_data:
e_entity_norm = normalize_entity(e["entity"])
e_page = e["page"]
# Match if normalized entity and page match
if (
s["label"] == e["label"]
and s_entity_norm == e_entity_norm
and s_page == e_page
):
merged.append(
{
"label": s["label"],
"entity": s["entity"],
"page": s_page,
"status": "validated",
}
)
seen.add((e["entity"], e_page))
found = True
break
# If no match found, add as single-source
if not found:
merged.append(
{
"label": s["label"],
"entity": s["entity"],
"page": s_page,
"status": "single-source",
"source": "spacy",
}
)
# Add remaining Exxeta entities not already processed
for e in exxeta_data:
if (e["entity"], e["page"]) not in seen:
merged.append(
{
"label": e["label"],
"entity": e["entity"],
"page": e["page"],
"status": "single-source",
"source": "exxeta",
}
)
return merged

View File

@ -0,0 +1,14 @@
blinker==1.9.0
certifi==2025.4.26
charset-normalizer==3.4.2
click==8.2.1
dotenv==0.9.9
Flask==3.1.1
idna==3.10
itsdangerous==2.2.0
Jinja2==3.1.6
MarkupSafe==3.0.2
python-dotenv==1.1.0
requests==2.32.3
urllib3==2.4.0
Werkzeug==3.1.3

View File

@ -0,0 +1,12 @@
def validate_entities(entities):
return entities
#todo
valid = []
for entity in entities:
if entity["type"] == "PERSON":
if entity["name"] == "John Doe":
valid.append(entity)
elif entity["type"] == "ORG":
if entity["name"] == "Exxeta":
valid.append(entity)
return valid

View File

@ -30,7 +30,7 @@ services:
timeout: 5s timeout: 5s
retries: 5 retries: 5
ports: ports:
- 5000:5000 - 5050:5000
spacy: spacy:
build: build:
@ -42,3 +42,11 @@ services:
dockerfile: ../../Dockerfile dockerfile: ../../Dockerfile
env_file: env_file:
- .env - .env
validate:
build:
context: backend/validate-service
dockerfile: ../../Dockerfile
env_file:
- .env
ports:
- 5051:5000

Binary file not shown.

View File

@ -26,7 +26,7 @@ export default function UploadPage() {
px={2} px={2}
> >
<IconButton onClick={() => navigate({ to: '/config' })}> <IconButton onClick={() => navigate({ to: '/config' })}>
<SettingsIcon fontSize="large" /> <SettingsIcon fontSize="large"/>
</IconButton> </IconButton>
</Box> </Box>
<Paper <Paper
@ -91,7 +91,7 @@ export default function UploadPage() {
backgroundColor: '#383838', backgroundColor: '#383838',
}} }}
disabled={files.length === 0} disabled={files.length === 0}
onClick={() => alert('Kein Backend, aber Button klickbar')} onClick={() => navigate({ to: '/extractedResult' })}
> >
Kennzahlen extrahieren Kennzahlen extrahieren
</Button> </Button>

View File

@ -0,0 +1,91 @@
import { Document, Page, pdfjs } from "react-pdf";
import { useState, useRef, useEffect } from 'react';
import 'react-pdf/dist/esm/Page/AnnotationLayer.css';
import 'react-pdf/dist/esm/Page/TextLayer.css';
import { Box, IconButton } from '@mui/material';
import ArrowCircleLeftIcon from '@mui/icons-material/ArrowCircleLeft';
import ArrowCircleRightIcon from '@mui/icons-material/ArrowCircleRight';
import testPDF from '/example.pdf';
pdfjs.GlobalWorkerOptions.workerSrc = new URL(
"pdfjs-dist/build/pdf.worker.min.mjs",
import.meta.url,
).toString();
export default function PDFViewer() {
const [numPages, setNumPages] = useState<number | null>(null);
const [pageNumber, setPageNumber] = useState(1);
const [containerWidth, setContainerWidth] = useState<number | null>(null);
const containerRef = useRef<HTMLDivElement>(null);
const onDocumentLoadSuccess = ({ numPages }: { numPages: number }) => {
setNumPages(numPages);
};
useEffect(() => {
const updateWidth = () => {
if (containerRef.current) {
setContainerWidth(containerRef.current.offsetWidth);
}
};
updateWidth();
window.addEventListener('resize', updateWidth);
return () => window.removeEventListener('resize', updateWidth);
}, []);
return (
<Box
display="flex"
flexDirection="column"
justifyContent="center"
alignItems="center"
width="100%"
height="100%"
p={2}
>
<Box
ref={containerRef}
sx={{
width: '100%',
maxHeight: '90vh',
overflow: 'auto',
display: 'flex',
justifyContent: 'center',
alignItems: 'center',
}}
>
<Document file={testPDF}
onLoadSuccess={onDocumentLoadSuccess}
onLoadError={(error) => console.error('Es gab ein Fehler beim Laden des PDFs:', error)}
onSourceError={(error) => console.error('Ungültige PDF:', error)}>
{containerWidth && (
<Page
pageNumber={pageNumber}
width={containerWidth * 0.8}
/>
)}
</Document>
</Box>
<Box
mt={2}
display="flex"
alignItems="center"
justifyContent="center"
gap={1}
>
<IconButton disabled={pageNumber <= 1} onClick={() => setPageNumber(p => p - 1)}>
<ArrowCircleLeftIcon fontSize="large" />
</IconButton>
<span>{pageNumber} / {numPages}</span>
<IconButton
disabled={pageNumber >= (numPages || 1)}
onClick={() => setPageNumber(p => p + 1)}
>
<ArrowCircleRightIcon fontSize="large" />
</IconButton>
</Box>
</Box>
);
}

View File

@ -11,11 +11,18 @@
// Import Routes // Import Routes
import { Route as rootRoute } from './routes/__root' import { Route as rootRoute } from './routes/__root'
import { Route as ExtractedResultImport } from './routes/extractedResult'
import { Route as ConfigImport } from './routes/config' import { Route as ConfigImport } from './routes/config'
import { Route as IndexImport } from './routes/index' import { Route as IndexImport } from './routes/index'
// Create/Update Routes // Create/Update Routes
const ExtractedResultRoute = ExtractedResultImport.update({
id: '/extractedResult',
path: '/extractedResult',
getParentRoute: () => rootRoute,
} as any)
const ConfigRoute = ConfigImport.update({ const ConfigRoute = ConfigImport.update({
id: '/config', id: '/config',
path: '/config', path: '/config',
@ -46,6 +53,13 @@ declare module '@tanstack/react-router' {
preLoaderRoute: typeof ConfigImport preLoaderRoute: typeof ConfigImport
parentRoute: typeof rootRoute parentRoute: typeof rootRoute
} }
'/extractedResult': {
id: '/extractedResult'
path: '/extractedResult'
fullPath: '/extractedResult'
preLoaderRoute: typeof ExtractedResultImport
parentRoute: typeof rootRoute
}
} }
} }
@ -54,36 +68,41 @@ declare module '@tanstack/react-router' {
export interface FileRoutesByFullPath { export interface FileRoutesByFullPath {
'/': typeof IndexRoute '/': typeof IndexRoute
'/config': typeof ConfigRoute '/config': typeof ConfigRoute
'/extractedResult': typeof ExtractedResultRoute
} }
export interface FileRoutesByTo { export interface FileRoutesByTo {
'/': typeof IndexRoute '/': typeof IndexRoute
'/config': typeof ConfigRoute '/config': typeof ConfigRoute
'/extractedResult': typeof ExtractedResultRoute
} }
export interface FileRoutesById { export interface FileRoutesById {
__root__: typeof rootRoute __root__: typeof rootRoute
'/': typeof IndexRoute '/': typeof IndexRoute
'/config': typeof ConfigRoute '/config': typeof ConfigRoute
'/extractedResult': typeof ExtractedResultRoute
} }
export interface FileRouteTypes { export interface FileRouteTypes {
fileRoutesByFullPath: FileRoutesByFullPath fileRoutesByFullPath: FileRoutesByFullPath
fullPaths: '/' | '/config' fullPaths: '/' | '/config' | '/extractedResult'
fileRoutesByTo: FileRoutesByTo fileRoutesByTo: FileRoutesByTo
to: '/' | '/config' to: '/' | '/config' | '/extractedResult'
id: '__root__' | '/' | '/config' id: '__root__' | '/' | '/config' | '/extractedResult'
fileRoutesById: FileRoutesById fileRoutesById: FileRoutesById
} }
export interface RootRouteChildren { export interface RootRouteChildren {
IndexRoute: typeof IndexRoute IndexRoute: typeof IndexRoute
ConfigRoute: typeof ConfigRoute ConfigRoute: typeof ConfigRoute
ExtractedResultRoute: typeof ExtractedResultRoute
} }
const rootRouteChildren: RootRouteChildren = { const rootRouteChildren: RootRouteChildren = {
IndexRoute: IndexRoute, IndexRoute: IndexRoute,
ConfigRoute: ConfigRoute, ConfigRoute: ConfigRoute,
ExtractedResultRoute: ExtractedResultRoute,
} }
export const routeTree = rootRoute export const routeTree = rootRoute
@ -97,7 +116,8 @@ export const routeTree = rootRoute
"filePath": "__root.tsx", "filePath": "__root.tsx",
"children": [ "children": [
"/", "/",
"/config" "/config",
"/extractedResult"
] ]
}, },
"/": { "/": {
@ -105,6 +125,9 @@ export const routeTree = rootRoute
}, },
"/config": { "/config": {
"filePath": "config.tsx" "filePath": "config.tsx"
},
"/extractedResult": {
"filePath": "extractedResult.tsx"
} }
} }
} }

View File

@ -0,0 +1,101 @@
import { Box, Paper, Typography, Button } from '@mui/material';
import {createFileRoute, useNavigate} from '@tanstack/react-router';
import PDFViewer from '../components/pdfViewer';
import ContentPasteIcon from '@mui/icons-material/ContentPaste';
export const Route = createFileRoute('/extractedResult')({
component: ExtractedResultsPage,
});
function ExtractedResultsPage() {
const navigate = useNavigate();
const status: 'green' | 'yellow' | 'red' = 'red';
const statusColor = {
red: '#f43131',
yellow: '#f6ed48',
green: '#3fd942',
}[status];
return (
<Box p={4}>
<Box display="flex" alignItems="center" gap={3}>
<Box
sx={{
width: 45,
height: 45,
borderRadius: '50%',
backgroundColor: statusColor,
top: 32,
left: 32,
}}
/>
<Typography variant="h5" gutterBottom>
Kennzahlen extrahiert aus: <br/><strong>FONDSNAME: TODO</strong>
</Typography>
</Box>
<Box
display="flex"
gap={4}
sx={{
width: '100vw',
maxWidth: '100%',
height: '80vh',
mt: 4,
}}
>
<Paper
elevation={2}
sx={{
width: '45%',
height: '100%',
borderRadius: 2,
backgroundColor: '#eeeeee',
display: 'flex',
alignItems: 'center',
justifyContent: 'center',
}}
>
<Typography color="textSecondary">To-do: Table hierhin</Typography>
</Paper>
<Box
display="flex"
flexDirection="column"
justifyContent="space-between"
gap={5}
sx={{ width: '55%', height: '95%' }}
>
<Paper
elevation={2}
sx={{
height: '100%',
borderRadius: 2,
backgroundColor: '#eeeeee',
display: 'flex',
alignItems: 'center',
justifyContent: 'center',
}}
>
<PDFViewer/>
</Paper>
<Box mt={2} display="flex" justifyContent="flex-end" gap={2}>
<Button
variant="contained"
sx={{ backgroundColor: '#383838' }}
>
<ContentPasteIcon sx={{ fontSize: 18, mr: 1 }} />
Kennzahlenzeile kopieren
</Button>
<Button
variant="contained"
sx={{ backgroundColor: '#383838' }}
onClick={() => navigate({ to: '/' })}
>
Neu hochladen
</Button>
</Box>
</Box>
</Box>
</Box>
);
}