diff --git a/project/.env.template b/project/.env.template
index 7be0ac1..e7fe96c 100644
--- a/project/.env.template
+++ b/project/.env.template
@@ -2,3 +2,4 @@ API_KEY=
DATABASE_URL=postgresql://admin:admin@db:5432
POSTGRES_PASSWORD=admin
POSTGRES_USER=admin
+COORDINATOR_URL=coordinator:5000
diff --git a/project/Dockerfile b/project/Dockerfile
index 979c701..a1af038 100644
--- a/project/Dockerfile
+++ b/project/Dockerfile
@@ -18,4 +18,4 @@ COPY . .
ENV PYTHONUNBUFFERED=1
EXPOSE 5000
-CMD ["gunicorn", "--bind", "0.0.0.0:5000", "app:app"]
+CMD ["gunicorn","--timeout", "10","--workers", "2", "--bind", "0.0.0.0:5000", "app:app"]
diff --git a/project/backend/exxetaGPT/services/extractExxeta.py b/project/backend/exxetaGPT/services/extractExxeta.py
index e986d02..caa8e0c 100644
--- a/project/backend/exxetaGPT/services/extractExxeta.py
+++ b/project/backend/exxetaGPT/services/extractExxeta.py
@@ -42,7 +42,7 @@ def extract_with_exxeta(pages_json):
" },\n"
" ...\n"
"]\n\n"
- "Nur JSON-Antwort – keine Kommentare, keine Erklärungen.\n\n"
+ "Nur JSON-Antwort - keine Kommentare, keine Erklärungen.\n\n"
f"TEXT:\n{text}"
)
else:
@@ -68,7 +68,7 @@ def extract_with_exxeta(pages_json):
"- LÄNDERALLOKATION (z. B. DEUTSCHLAND, FRANKREICH, etc. inkl. %-Angaben)\n\n"
"WICHTIG:\n"
- "- Gib **nur eine Entität pro Kennzahl** an – keine Listen oder Interpretationen.\n"
+ "- Gib **nur eine Entität pro Kennzahl** an - keine Listen oder Interpretationen.\n"
"- Wenn mehrere Varianten genannt werden (z. B. \"Core und Core+\"), gib sie im Originalformat als **eine entity** an.\n"
"- **Keine Vermutungen oder Ergänzungen**. Wenn keine Information enthalten ist, gib die Kennzahl **nicht aus**.\n"
"- Extrahiere **nur wörtlich vorkommende Inhalte** (keine Berechnungen, keine Zusammenfassungen).\n"
@@ -86,7 +86,7 @@ def extract_with_exxeta(pages_json):
"]\n\n"
f"Falls keine Kennzahl enthalten ist, gib ein leeres Array [] zurück.\n\n"
- f"Nur JSON-Antwort – keine Kommentare, keine Erklärungen, kein Text außerhalb des JSON.\n\n"
+ f"Nur JSON-Antwort - keine Kommentare, keine Erklärungen, kein Text außerhalb des JSON.\n\n"
f"TEXT:\n{text}"
)
diff --git a/project/backend/validate-service/.env.template b/project/backend/validate-service/.env.template
new file mode 100644
index 0000000..94dd768
--- /dev/null
+++ b/project/backend/validate-service/.env.template
@@ -0,0 +1 @@
+COORDINATOR_URL=""
diff --git a/project/backend/validate-service/README.md b/project/backend/validate-service/README.md
new file mode 100644
index 0000000..f4adf62
--- /dev/null
+++ b/project/backend/validate-service/README.md
@@ -0,0 +1,97 @@
+# Validate Service
+
+A Flask-based microservice for asynchronously processing and validating entities from two different NLP services (SpaCy and Exxeta).
+
+## How it works
+
+For every unique ID, the service receives two POST requests from two different upstream services:
+1. **SpaCy service** - sends extracted entities
+2. **Exxeta service** - sends extracted entities
+
+The data from the first request is cached. The second request triggers the asynchronous processing; once it finishes, the results are forwarded to a downstream service.
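+
+For illustration, the two-request handshake as a client would drive it (a minimal sketch; the ID and entities are made up, the port matches the local default):
+
+```python
+import requests
+
+BASE = "http://localhost:5050"
+entities = [{"label": "PERSON", "entity": "Max Mustermann", "page": 1}]
+
+# First request: the payload is cached and the service waits for its counterpart.
+r1 = requests.post(f"{BASE}/validate", json={"id": "pb-1", "service": "spacy", "entities": entities})
+print(r1.json()["message"])  # First dataset for ID pb-1 from spacy stored. ...
+
+# Second request: both datasets are present, asynchronous processing starts.
+r2 = requests.post(f"{BASE}/validate", json={"id": "pb-1", "service": "exxeta", "entities": entities})
+print(r2.json()["message"])  # Second dataset for ID pb-1 received. Processing started.
+```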
+
+## API Endpoints
+
+### POST /validate
+
+Receives entity data from the SpaCy or Exxeta service.
+
+**Request Body:**
+```json
+{
+  "id": "pitch_book_id",
+  "service": "spacy|exxeta",
+  "entities": [
+    {
+      "label": "PERSON",
+      "entity": "Max Mustermann",
+      "page": 1
+    }
+  ]
+}
+```
+
+**Response:**
+- **200**: Data accepted (first dataset cached, or second dataset received and processing started)
+- **400**: Missing or invalid parameters
+- **500**: Server error
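+
+For instance, a request missing the `service` field is rejected (sketch; local default port, made-up ID):
+
+```python
+import requests
+
+# Neither "entities" alone nor "id" alone is enough; "service" is mandatory.
+r = requests.post("http://localhost:5050/validate", json={"id": "pb-1", "entities": []})
+print(r.status_code)  # 400 - 'id' and 'service' are required
+```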
+
+## Installation and startup
+
+1. **Install dependencies:**
+```bash
+pip install -r requirements.txt
+```
+
+2. **Start the service:**
+```bash
+python app.py
+```
+
+By default the service listens on `http://localhost:5050`.
+
+## Configuration
+
+Environment variables (loaded from `.env` via `python-dotenv` in `app.py`):
+
+- `COORDINATOR_URL`: host and port of the coordinator, without a scheme (e.g. `coordinator:5000`)
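+
+For reference, this is how `app.py` turns the variable into the downstream endpoint - the scheme is prefixed in code, and `42` is a made-up ID:
+
+```python
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+coordinator_url = os.getenv("COORDINATOR_URL") or ""  # e.g. "coordinator:5000"
+
+request_id = 42  # example pitch book ID
+put_url = f"http://{coordinator_url}/api/pitch_book/{request_id}"
+print(put_url)  # -> http://coordinator:5000/api/pitch_book/42
+```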
+
+## Processing logic
+
+1. **Caching**: The first request's JSON is stored in a thread-safe dictionary
+2. **Trigger**: The second request starts the asynchronous processing
+3. **Merge & validate**: `merge_entities` and `validate_entities` perform the merge and validation (see the sketch after this list):
+   - Normalizes entities (lowercases and strips all whitespace, including line breaks)
+   - Matches entities on label, normalized text, and page number
+   - Marks entities as "validated" (found by both services) or "single-source"
+4. **Forwarding**: Results are sent to the next service via HTTP PUT
+5. **Cleanup**: Processed data is removed from the in-memory store
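+
+A quick look at the normalization that drives the matching (assumes `merge_logic.py` from this service is importable):
+
+```python
+from merge_logic import normalize_entity
+
+# All whitespace (including line breaks) is stripped and the text lowercased,
+# so layout differences between the two services do not prevent a match:
+print(normalize_entity("Max\nMustermann"))  # -> "maxmustermann"
+print(normalize_entity("Core und Core+"))   # -> "coreundcore+"
+```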
+
+## Architecture
+
+```
+┌─────────────────┐          ┌─────────────────┐
+│  SpaCy Service  │          │ Exxeta Service  │
+└─────────┬───────┘          └─────────┬───────┘
+          │                            │
+          │ POST /validate             │ POST /validate
+          │ (service: "spacy")         │ (service: "exxeta")
+          ▼                            ▼
+    ┌─────────────────────────────────────┐
+    │          Validate Service           │
+    │  ┌─────────────────────────────┐    │
+    │  │        Cache                │    │
+    │  │ (thread-safe dictionary)    │    │
+    │  └─────────────────────────────┘    │
+    │  ┌─────────────────────────────┐    │
+    │  │  Asynchronous processing    │    │
+    │  │ (merge + validate entities) │    │
+    │  └─────────────────────────────┘    │
+    └─────────────┬───────────────────────┘
+                  │
+                  │ PUT (processed data)
+                  ▼
+      ┌─────────────────────────────┐
+      │    Downstream service       │
+      │      (coordinator)          │
+      └─────────────────────────────┘
+```
diff --git a/project/backend/validate-service/app.py b/project/backend/validate-service/app.py
new file mode 100644
index 0000000..6bd5e45
--- /dev/null
+++ b/project/backend/validate-service/app.py
@@ -0,0 +1,130 @@
+from flask import Flask, request, jsonify
+import threading
+from merge_logic import merge_entities
+from validate_logic import validate_entities
+from dotenv import load_dotenv
+import os
+import requests
+import json
+
+app = Flask(__name__)
+
+load_dotenv()
+coordinator_url = os.getenv("COORDINATOR_URL") or ""
+
+# todo add persistence layer
+data_storage = {} # {id: {spacy_data: [], exxeta_data: []}}
+
+storage_lock = threading.Lock()
+
+
+def send_to_coordinator_service(processed_data, request_id):
+    if not coordinator_url:
+        print(f"Result for ID {request_id} not sent: COORDINATOR_URL is not set", processed_data)
+        return
+
+    try:
+        payload = {
+            "kpi": json.dumps(processed_data),
+        }
+        requests.put(
+            f"http://{coordinator_url}/api/pitch_book/{request_id}",
+            data=payload,
+            timeout=10,
+        )
+        print(f"Result PitchBook {request_id} sent to coordinator")
+
+    except Exception as e:
+        print(f"Error sending ID {request_id}: {e}")
+
+
+def process_data_async(request_id, spacy_data, exxeta_data):
+    try:
+        print(f"Start asynchronous processing for PitchBook: {request_id}")
+
+        # Perform merge
+        merged_entities = merge_entities(spacy_data, exxeta_data)
+        valid_entities = validate_entities(merged_entities)
+
+        # Send result to next service
+        send_to_coordinator_service(valid_entities, request_id)
+
+        # Remove processed data from storage
+        with storage_lock:
+            if request_id in data_storage:
+                del data_storage[request_id]
+
+    except Exception as e:
+        print(f"Error during asynchronous processing for ID {request_id}: {e}")
+
+
+@app.route("/validate", methods=["POST"])
+def validate():
+    try:
+        json_data = request.get_json()
+
+        if not json_data:
+            return jsonify({"error": "Missing JSON data"}), 400
+
+        # extract ID and service type from the data
+        request_id = json_data.get("id")
+        service_type = json_data.get("service")  # 'spacy' or 'exxeta'
+        entities = json_data.get("entities", [])
+
+        if not request_id or not service_type:
+            return jsonify({"error": "'id' and 'service' are required"}), 400
+
+        if service_type not in ["spacy", "exxeta"]:
+            return jsonify({"error": "'service' has to be 'spacy' or 'exxeta'"}), 400
+
+        with storage_lock:
+            # Initialize entry if not already present
+            if request_id not in data_storage:
+                data_storage[request_id] = {
+                    "spacy_data": None,
+                    "exxeta_data": None,
+                }
+
+            # Store the data based on the service type
+            data_storage[request_id][f"{service_type}_data"] = entities
+
+            # Check if both datasets are present
+            stored_data = data_storage[request_id]
+            spacy_data = stored_data["spacy_data"]
+            exxeta_data = stored_data["exxeta_data"]
+
+            # If both datasets are present, start asynchronous processing
+            if spacy_data is not None and exxeta_data is not None:
+                # Start asynchronous processing in a separate thread
+                processing_thread = threading.Thread(
+                    target=process_data_async,
+                    args=(request_id, spacy_data, exxeta_data),
+                    daemon=True,
+                )
+                processing_thread.start()
+
+                return (
+                    jsonify(
+                        {
+                            "message": f"Second dataset for ID {request_id} received. Processing started.",
+                        }
+                    ),
+                    200,
+                )
+            else:
+                return (
+                    jsonify(
+                        {
+                            "message": f"First dataset for ID {request_id} from {service_type} stored. Waiting for second dataset.",
+                        }
+                    ),
+                    200,
+                )
+    except Exception as e:
+        print(f"Error occurred: {str(e)}")
+        return jsonify({"error": f"Error: {str(e)}"}), 500
+
+
+if __name__ == "__main__":
+    app.run(debug=True, host="0.0.0.0", port=5050)
diff --git a/project/backend/validate-service/merge_logic.py b/project/backend/validate-service/merge_logic.py
new file mode 100644
index 0000000..1bc404c
--- /dev/null
+++ b/project/backend/validate-service/merge_logic.py
@@ -0,0 +1,68 @@
+# Lowercase and strip all whitespace (including line breaks) so that layout
+# differences between the two services do not prevent a match.
+def normalize_entity(entity_str):
+    if not entity_str:
+        return ""
+    normalized = entity_str.replace("\n", " ")
+    normalized = "".join(normalized.lower().split())
+    return normalized
+
+
+def merge_entities(spacy_data, exxeta_data):
+    merged = []
+    seen = set()
+
+    # Process SpaCy entities first
+    for s in spacy_data:
+        s_entity_norm = normalize_entity(s["entity"])
+        s_page = s["page"]
+
+        # Look for matching Exxeta entities
+        found = False
+        for e in exxeta_data:
+            e_entity_norm = normalize_entity(e["entity"])
+            e_page = e["page"]
+
+            # Match if label, normalized entity, and page all agree
+            if (
+                s["label"] == e["label"]
+                and s_entity_norm == e_entity_norm
+                and s_page == e_page
+            ):
+                merged.append(
+                    {
+                        "label": s["label"],
+                        "entity": s["entity"],
+                        "page": s_page,
+                        "status": "validated",
+                    }
+                )
+                seen.add((e["entity"], e_page))
+                found = True
+                break
+
+        # If no match found, add as single-source
+        if not found:
+            merged.append(
+                {
+                    "label": s["label"],
+                    "entity": s["entity"],
+                    "page": s_page,
+                    "status": "single-source",
+                    "source": "spacy",
+                }
+            )
+
+    # Add remaining Exxeta entities not already processed
+    for e in exxeta_data:
+        if (e["entity"], e["page"]) not in seen:
+            merged.append(
+                {
+                    "label": e["label"],
+                    "entity": e["entity"],
+                    "page": e["page"],
+                    "status": "single-source",
+                    "source": "exxeta",
+                }
+            )
+
+    return merged
diff --git a/project/backend/validate-service/requirements.txt b/project/backend/validate-service/requirements.txt
new file mode 100644
index 0000000..c637b6e
--- /dev/null
+++ b/project/backend/validate-service/requirements.txt
@@ -0,0 +1,14 @@
+blinker==1.9.0
+certifi==2025.4.26
+charset-normalizer==3.4.2
+click==8.2.1
+Flask==3.1.1
+idna==3.10
+itsdangerous==2.2.0
+Jinja2==3.1.6
+MarkupSafe==3.0.2
+python-dotenv==1.1.0
+requests==2.32.3
+urllib3==2.4.0
+Werkzeug==3.1.3
diff --git a/project/backend/validate-service/validate_logic.py b/project/backend/validate-service/validate_logic.py
new file mode 100644
index 0000000..4685790
--- /dev/null
+++ b/project/backend/validate-service/validate_logic.py
@@ -0,0 +1,12 @@
+def validate_entities(entities):
+    # TODO: implement real validation rules; for now every entity passes through.
+    return entities
+    # Unreachable draft kept for reference. Note that merged entities carry the
+    # keys "label" and "entity" (not "type" and "name"):
+    # valid = []
+    # for entity in entities:
+    #     if entity["label"] == "PERSON" and entity["entity"] == "John Doe":
+    #         valid.append(entity)
+    #     elif entity["label"] == "ORG" and entity["entity"] == "Exxeta":
+    #         valid.append(entity)
+    # return valid
diff --git a/project/docker-compose.yml b/project/docker-compose.yml
index e4877ba..5615026 100644
--- a/project/docker-compose.yml
+++ b/project/docker-compose.yml
@@ -30,7 +30,7 @@ services:
timeout: 5s
retries: 5
ports:
- - 5000:5000
+ - 5050:5000
spacy:
build:
@@ -42,3 +42,11 @@ services:
dockerfile: ../../Dockerfile
env_file:
- .env
+ validate:
+ build:
+ context: backend/validate-service
+ dockerfile: ../../Dockerfile
+ env_file:
+ - .env
+ ports:
+ - 5051:5000
diff --git a/project/frontend/public/example.pdf b/project/frontend/public/example.pdf
new file mode 100644
index 0000000..759093b
Binary files /dev/null and b/project/frontend/public/example.pdf differ
diff --git a/project/frontend/src/components/UploadPage.tsx b/project/frontend/src/components/UploadPage.tsx
index 172ce61..6916b27 100644
--- a/project/frontend/src/components/UploadPage.tsx
+++ b/project/frontend/src/components/UploadPage.tsx
@@ -26,7 +26,7 @@ export default function UploadPage() {
px={2}
>
       <IconButton onClick={() => navigate({ to: '/config' })}>
         <Button
-          onClick={() => alert('Kein Backend, aber Button klickbar')}
+          onClick={() => navigate({ to: '/extractedResult' })}
         >
           Kennzahlen extrahieren
diff --git a/project/frontend/src/components/pdfViewer.tsx b/project/frontend/src/components/pdfViewer.tsx
new file mode 100644
index 0000000..2e8f7d1
--- /dev/null
+++ b/project/frontend/src/components/pdfViewer.tsx
@@ -0,0 +1,91 @@
+import { Document, Page, pdfjs } from "react-pdf";
+import { useState, useRef, useEffect } from 'react';
+import 'react-pdf/dist/esm/Page/AnnotationLayer.css';
+import 'react-pdf/dist/esm/Page/TextLayer.css';
+import { Box, IconButton } from '@mui/material';
+import ArrowCircleLeftIcon from '@mui/icons-material/ArrowCircleLeft';
+import ArrowCircleRightIcon from '@mui/icons-material/ArrowCircleRight';
+const testPDF = '/example.pdf';
+
+pdfjs.GlobalWorkerOptions.workerSrc = new URL(
+ "pdfjs-dist/build/pdf.worker.min.mjs",
+ import.meta.url,
+).toString();
+
+export default function PDFViewer() {
+  const [numPages, setNumPages] = useState<number | null>(null);
+  const [pageNumber, setPageNumber] = useState(1);
+  const [containerWidth, setContainerWidth] = useState<number | null>(null);
+
+  const containerRef = useRef<HTMLDivElement | null>(null);
+
+  const onDocumentLoadSuccess = ({ numPages }: { numPages: number }) => {
+    setNumPages(numPages);
+  };
+
+  useEffect(() => {
+    const updateWidth = () => {
+      if (containerRef.current) {
+        setContainerWidth(containerRef.current.offsetWidth);
+      }
+    };
+
+    updateWidth();
+    window.addEventListener('resize', updateWidth);
+    return () => window.removeEventListener('resize', updateWidth);
+  }, []);
+
+  return (
+    <Box ref={containerRef}>
+      <Document
+        file={testPDF}
+        onLoadSuccess={onDocumentLoadSuccess}
+        onLoadError={(error) => console.error('Error while loading the PDF:', error)}
+        onSourceError={(error) => console.error('Invalid PDF source:', error)}
+      >
+        {containerWidth && (
+          <Page pageNumber={pageNumber} width={containerWidth} />
+        )}
+      </Document>
+      <Box display="flex" alignItems="center" justifyContent="center">
+        <IconButton disabled={pageNumber <= 1} onClick={() => setPageNumber(p => p - 1)}>
+          <ArrowCircleLeftIcon />
+        </IconButton>
+        {pageNumber} / {numPages}
+        <IconButton
+          disabled={pageNumber >= (numPages || 1)}
+          onClick={() => setPageNumber(p => p + 1)}
+        >
+          <ArrowCircleRightIcon />
+        </IconButton>
+      </Box>
+    </Box>
+  );
+}
\ No newline at end of file
diff --git a/project/frontend/src/routeTree.gen.ts b/project/frontend/src/routeTree.gen.ts
index fecd2db..cade514 100644
--- a/project/frontend/src/routeTree.gen.ts
+++ b/project/frontend/src/routeTree.gen.ts
@@ -11,11 +11,18 @@
// Import Routes
import { Route as rootRoute } from './routes/__root'
+import { Route as ExtractedResultImport } from './routes/extractedResult'
import { Route as ConfigImport } from './routes/config'
import { Route as IndexImport } from './routes/index'
// Create/Update Routes
+const ExtractedResultRoute = ExtractedResultImport.update({
+ id: '/extractedResult',
+ path: '/extractedResult',
+ getParentRoute: () => rootRoute,
+} as any)
+
const ConfigRoute = ConfigImport.update({
id: '/config',
path: '/config',
@@ -46,6 +53,13 @@ declare module '@tanstack/react-router' {
preLoaderRoute: typeof ConfigImport
parentRoute: typeof rootRoute
}
+ '/extractedResult': {
+ id: '/extractedResult'
+ path: '/extractedResult'
+ fullPath: '/extractedResult'
+ preLoaderRoute: typeof ExtractedResultImport
+ parentRoute: typeof rootRoute
+ }
}
}
@@ -54,36 +68,41 @@ declare module '@tanstack/react-router' {
export interface FileRoutesByFullPath {
'/': typeof IndexRoute
'/config': typeof ConfigRoute
+ '/extractedResult': typeof ExtractedResultRoute
}
export interface FileRoutesByTo {
'/': typeof IndexRoute
'/config': typeof ConfigRoute
+ '/extractedResult': typeof ExtractedResultRoute
}
export interface FileRoutesById {
__root__: typeof rootRoute
'/': typeof IndexRoute
'/config': typeof ConfigRoute
+ '/extractedResult': typeof ExtractedResultRoute
}
export interface FileRouteTypes {
fileRoutesByFullPath: FileRoutesByFullPath
- fullPaths: '/' | '/config'
+ fullPaths: '/' | '/config' | '/extractedResult'
fileRoutesByTo: FileRoutesByTo
- to: '/' | '/config'
- id: '__root__' | '/' | '/config'
+ to: '/' | '/config' | '/extractedResult'
+ id: '__root__' | '/' | '/config' | '/extractedResult'
fileRoutesById: FileRoutesById
}
export interface RootRouteChildren {
IndexRoute: typeof IndexRoute
ConfigRoute: typeof ConfigRoute
+ ExtractedResultRoute: typeof ExtractedResultRoute
}
const rootRouteChildren: RootRouteChildren = {
IndexRoute: IndexRoute,
ConfigRoute: ConfigRoute,
+ ExtractedResultRoute: ExtractedResultRoute,
}
export const routeTree = rootRoute
@@ -97,7 +116,8 @@ export const routeTree = rootRoute
"filePath": "__root.tsx",
"children": [
"/",
- "/config"
+ "/config",
+ "/extractedResult"
]
},
"/": {
@@ -105,6 +125,9 @@ export const routeTree = rootRoute
},
"/config": {
"filePath": "config.tsx"
+ },
+ "/extractedResult": {
+ "filePath": "extractedResult.tsx"
}
}
}
diff --git a/project/frontend/src/routes/extractedResult.tsx b/project/frontend/src/routes/extractedResult.tsx
new file mode 100644
index 0000000..15ce5b8
--- /dev/null
+++ b/project/frontend/src/routes/extractedResult.tsx
@@ -0,0 +1,101 @@
+import { Box, Paper, Typography, Button } from '@mui/material';
+import { createFileRoute, useNavigate } from '@tanstack/react-router';
+import PDFViewer from '../components/pdfViewer';
+import ContentPasteIcon from '@mui/icons-material/ContentPaste';
+
+export const Route = createFileRoute('/extractedResult')({
+ component: ExtractedResultsPage,
+});
+
+function ExtractedResultsPage() {
+  const navigate = useNavigate();
+  const status: 'green' | 'yellow' | 'red' = 'red';
+
+  const statusColor = {
+    red: '#f43131',
+    yellow: '#f6ed48',
+    green: '#3fd942',
+  }[status];
+
+  return (
+    <Box display="flex" gap={2} p={2}>
+      <Paper sx={{ flex: 1, p: 2 }}>
+        <Box display="flex" alignItems="center" gap={1}>
+          <ContentPasteIcon />
+          <Typography variant="h6">
+            Kennzahlen extrahiert aus:<br />FONDSNAME: TODO
+          </Typography>
+          <Box sx={{ width: 16, height: 16, borderRadius: '50%', bgcolor: statusColor }} />
+        </Box>
+        <Typography sx={{ mt: 2 }}>
+          To-do: Table hierhin
+        </Typography>
+        <Button variant="contained" sx={{ mt: 2 }} onClick={() => navigate({ to: '/' })}>
+          Zurück
+        </Button>
+      </Paper>
+      <Paper sx={{ flex: 1, p: 2 }}>
+        <PDFViewer />
+      </Paper>
+    </Box>
+  );
+}
\ No newline at end of file