Compare commits

...

4 Commits

Author SHA1 Message Date
s8613 63882d77c0 Merge conflict resolved 2025-06-04 20:25:54 +02:00
s8613 96ad5fd15c Merge remote-tracking branch 'origin/main' into #13-OCR-Service
# Conflicts:
#	project/frontend/src/components/pdfViewer.tsx
2025-06-04 20:19:46 +02:00
Jaronim Pracht f81624b8ab Merge pull request 'Pdf Seite ist anspringbar durch Kennzahlen-Tabelle' (#50) from #21-seite-anspringen into main
Reviewed-on: #50
2025-06-04 19:45:07 +02:00
Zainab2604 93334898c9 Pdf Seite ist anspringbar durch Kennzahlen-Tabelle 2025-06-03 22:03:21 +02:00
6 changed files with 100 additions and 69 deletions

70
project/README.md 100644
View File

@ -0,0 +1,70 @@
# PSE2 - Pitchbook Extraction Webapplication
A microservices platform for processing pitchbook PDFs using OCR and entity extraction services. It combines SpaCy NLP and GPT-based (ExxetaGPT) extraction of KPIs from pitchbooks.
## Quick Start
### 1. Environment Setup
Create a `.env` file in the project root:
```
# Database
DATABASE_URL=url
POSTGRES_USER=admin
POSTGRES_PASSWORD=password
# API Key (required for ExxetaGPT service)
API_KEY=your_exxeta_jwt_token_here
```
### 2. Start Application
```bash
# Build and start all services
docker-compose up --build
# Run in background
docker-compose up --build -d
# Stop services
docker-compose down
```
### 3. Access Application
- **Frontend:** http://localhost:8080
- **API:** http://localhost:5050
## Services Overview
| Service | Port | Purpose |
|---------|------|---------|
| **Frontend** | 8080 | React UI for file upload and results display |
| **Coordinator** | 5050 | Main API, file storage, database management |
| **OCR** | 5051 | PDF text extraction using OCRmyPDF |
| **ExxetaGPT** | 5053 | AI entity extraction using GPT-4o-mini |
| **SpaCy** | 5052 | NLP entity extraction using custom model |
| **Validate** | 5054 | Merges and validates results from both extractors |
| **Database** | 5432 | PostgreSQL for data persistence |
## Usage Flow
1. Upload PDF via web interface
2. OCR service extracts text from PDF
3. Both the ExxetaGPT and SpaCy services extract KPI entities
4. Validate service merges and validates results
5. View the extracted KPIs and the original PDF side by side
## Troubleshooting
**Services won't start:**
```bash
# Check logs
docker-compose logs
```
**ExxetaGPT errors:**
- Ensure `API_KEY` is set in `.env` file
- Check API key validity and network access
**Database connection issues:**
- Wait for database health check to pass
- Verify `DATABASE_URL` format in `.env`

View File

@ -1,41 +0,0 @@
# ExxetaGPT Microservice
## Lokaler Start (ohne Container)
### 1. Voraussetzungen
- Python 3.11+
- Virtuelle Umgebung (empfohlen)
```bash
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
### 2. .env Datei erstellen
Lege im Projektverzeichnis eine `.env`-Datei mit dem Exxeta-API-Key an.
(Der API-Key ist ein JWT von Exxeta – nicht veröffentlichen!)
### 3. Starten
```bash
python app.py
```
## Verwendung als Docker-Container
### 1. Build
```bash
docker build -t exxeta-gpt .
```
### 2. Starten
```bash
docker run -p 5050:5050 --env-file .env exxeta-gpt
```
## Beispielaufruf:
```bash
curl -X POST http://localhost:5050/extract \
-H "Content-Type: application/json" \
-d @text-per-page.json
```

View File

@ -1,20 +0,0 @@
# SpaCy Microservice
## Den Service in einem Docker-Container starten
### 1. Build
```bash
docker build -t spacy-service .
```
### 2. Starten
```bash
docker run -p 5050:5050 spacy-service
```
## Beispielaufruf:
```bash
curl -X POST http://localhost:5050/extraction \
-H "Content-Type: application/json" \
-d @text-per-page.json
```

View File

@ -2,7 +2,7 @@ import {
Table, TableBody, TableCell, TableContainer, Table, TableBody, TableCell, TableContainer,
TableHead, TableRow, Paper, Box, TableHead, TableRow, Paper, Box,
Dialog, DialogActions, DialogContent, DialogTitle, Dialog, DialogActions, DialogContent, DialogTitle,
TextField, Button TextField, Button, Link
} from '@mui/material'; } from '@mui/material';
import ErrorOutlineIcon from '@mui/icons-material/ErrorOutline'; import ErrorOutlineIcon from '@mui/icons-material/ErrorOutline';
import SearchIcon from '@mui/icons-material/Search'; import SearchIcon from '@mui/icons-material/Search';
@ -15,11 +15,16 @@ import {
{ label: 'Fondsname', value: 'Fund Real Estate Prime Europe', page: 1, status: 'ok' }, { label: 'Fondsname', value: 'Fund Real Estate Prime Europe', page: 1, status: 'ok' },
{ label: 'Fondsmanager', value: '', page: 1, status: 'error' }, { label: 'Fondsmanager', value: '', page: 1, status: 'error' },
{ label: 'Risikoprofil', value: 'Core/Core+', page: 10, status: 'warning' }, { label: 'Risikoprofil', value: 'Core/Core+', page: 10, status: 'warning' },
{ label: 'LTV', value: '30-35 %', page: 8, status: 'ok' } { label: 'LTV', value: '30-35 %', page: 8, status: 'ok' },
{ label: 'Ausschüttungsrendite', value: '4%', page: 34, status: 'ok' }
]; ];
interface KennzahlenTableProps {
onPageClick?: (page: number) => void;
}
// React-Komponente // React-Komponente
export default function KennzahlenTable() { export default function KennzahlenTable({ onPageClick }: KennzahlenTableProps) {
// Zustand für bearbeitbare Daten // Zustand für bearbeitbare Daten
const [rows, setRows] = useState(exampleData); const [rows, setRows] = useState(exampleData);
@ -100,7 +105,15 @@ import {
</TableCell> </TableCell>
{/* Seitenzahl */} {/* Seitenzahl */}
<TableCell>{row.page}</TableCell> <TableCell>
<Link
component="button"
onClick={() => onPageClick?.(row.page)}
sx={{ cursor: 'pointer' }}
>
{row.page}
</Link>
</TableCell>
</TableRow> </TableRow>
); );
})} })}

View File

@ -9,11 +9,12 @@ import { socket } from "../socket";
interface PDFViewerProps { interface PDFViewerProps {
pitchBookId: string; pitchBookId: string;
currentPage?: number;
} }
export default function PDFViewer({ pitchBookId }: PDFViewerProps) { export default function PDFViewer({ pitchBookId, currentPage }: PDFViewerProps) {
const [numPages, setNumPages] = useState<number | null>(null); const [numPages, setNumPages] = useState<number | null>(null);
const [pageNumber, setPageNumber] = useState(1); const [pageNumber, setPageNumber] = useState(currentPage || 1);
const [containerWidth, setContainerWidth] = useState<number | null>(null); const [containerWidth, setContainerWidth] = useState<number | null>(null);
const [pdfKey, setPdfKey] = useState(Date.now()); const [pdfKey, setPdfKey] = useState(Date.now());
const containerRef = useRef<HTMLDivElement>(null); const containerRef = useRef<HTMLDivElement>(null);
@ -34,6 +35,12 @@ export default function PDFViewer({ pitchBookId }: PDFViewerProps) {
return () => window.removeEventListener("resize", updateWidth); return () => window.removeEventListener("resize", updateWidth);
}, []); }, []);
useEffect(() => {
if (currentPage && currentPage !== pageNumber) {
setPageNumber(currentPage);
}
}, [currentPage]);
useEffect(() => { useEffect(() => {
const handleProgress = (data: { id: number; progress: number }) => { const handleProgress = (data: { id: number; progress: number }) => {
if (data.id.toString() === pitchBookId && data.progress === 50) { if (data.id.toString() === pitchBookId && data.progress === 50) {

View File

@ -1,6 +1,7 @@
import ContentPasteIcon from "@mui/icons-material/ContentPaste"; import ContentPasteIcon from "@mui/icons-material/ContentPaste";
import { Box, Button, Paper, Typography } from "@mui/material"; import { Box, Button, Paper, Typography } from "@mui/material";
import { createFileRoute, useNavigate } from "@tanstack/react-router"; import { createFileRoute, useNavigate } from "@tanstack/react-router";
import { useState } from "react";
import KennzahlenTable from "../components/KennzahlenTable"; import KennzahlenTable from "../components/KennzahlenTable";
import PDFViewer from "../components/pdfViewer"; import PDFViewer from "../components/pdfViewer";
@ -12,6 +13,7 @@ function ExtractedResultsPage() {
const { pitchBook } = Route.useParams(); const { pitchBook } = Route.useParams();
const navigate = useNavigate(); const navigate = useNavigate();
const status: "green" | "yellow" | "red" = "red"; const status: "green" | "yellow" | "red" = "red";
const [currentPage, setCurrentPage] = useState(1);
const statusColor = { const statusColor = {
red: "#f43131", red: "#f43131",
@ -58,7 +60,7 @@ function ExtractedResultsPage() {
overflow: "auto", overflow: "auto",
}} }}
> >
<KennzahlenTable /> <KennzahlenTable onPageClick={setCurrentPage} />
</Paper> </Paper>
<Box <Box
display="flex" display="flex"
@ -78,7 +80,7 @@ function ExtractedResultsPage() {
justifyContent: "center", justifyContent: "center",
}} }}
> >
<PDFViewer pitchBookId={pitchBook} /> <PDFViewer pitchBookId={pitchBook} currentPage={currentPage} />
</Paper> </Paper>
<Box mt={2} display="flex" justifyContent="flex-end" gap={2}> <Box mt={2} display="flex" justifyContent="flex-end" gap={2}>
<Button variant="contained" sx={{ backgroundColor: "#383838" }}> <Button variant="contained" sx={{ backgroundColor: "#383838" }}>