BA-Chatbot/data_service/converter/pdf_converter.py

84 lines
3.6 KiB
Python
Executable File

"""
The PDFConverter class is a utility for converting PDF files into formats suited to subsequent processing.
It relies on pdfminer and pdfplumber and, optionally, on Haystack's PDFToTextConverter.
The class can convert PDFs to plain text (via Haystack or pdfminer), convert them to HTML (via pdfminer), and extract tables with pdfplumber, rendering them as HTML via pandas.
Although it currently uses these specific libraries, it is structured so that other libraries such as Camelot could be integrated for table extraction, as indicated by the commented-out methods.
The converter serves as a pre-processing component in data processing workflows, preparing PDF content for more detailed analysis or for content management systems.
"""
from pathlib import Path
from typing import List
from io import StringIO
from pdfminer.high_level import extract_text_to_fp, extract_text
from pdfminer.layout import LAParams
import pdfplumber
import pandas as pd
# import camelot
class PDFConverter:
    """Convert PDF files to plain text, HTML, or HTML-rendered tables.

    Text extraction is available through an optional haystack
    ``PDFToTextConverter`` or through pdfminer; HTML conversion uses
    pdfminer; table extraction uses pdfplumber, with pandas rendering each
    table as an HTML ``<table>`` string.
    """

    def __init__(self, init_haystack=True) -> None:
        """Create the converter, optionally setting up the haystack backend.

        Args:
            init_haystack: When true, import haystack and build a
                ``PDFToTextConverter`` (numeric tables removed, valid
                languages ``de`` and ``en``). When false, haystack is never
                imported and ``haystack_converter`` stays ``None``.
        """
        # Always define the attribute so that later checks on it cannot fail
        # with AttributeError when haystack was not requested.
        self.haystack_converter = None
        if init_haystack:
            # Imported lazily so haystack is only required when it is used.
            from haystack.nodes import PDFToTextConverter

            self.haystack_converter = PDFToTextConverter(
                remove_numeric_tables=True,
                valid_languages=["de", "en"],
            )

    def convert_pdf_to_text_haystack(self, path: Path) -> List:
        """Convert the PDF at ``path`` with the haystack converter.

        Args:
            path: Location of the PDF file, passed to haystack as
                ``file_path``.

        Returns:
            Whatever ``PDFToTextConverter.convert`` returns (presumably a
            list of haystack documents -- confirm against the installed
            haystack version).

        Raises:
            RuntimeError: If the instance was created with
                ``init_haystack=False`` and so has no haystack converter.
        """
        if self.haystack_converter is None:
            raise RuntimeError(
                "haystack converter is not initialised; "
                "create PDFConverter(init_haystack=True) to use it"
            )
        return self.haystack_converter.convert(file_path=path, meta=None)

    def convert_pdf_to_text_pdfminer(self, path: Path):
        """Return the text pdfminer's ``extract_text`` produces for ``path``."""
        return extract_text(path)

    def convert_pdf_to_html(self, path: Path) -> str:
        """Render the PDF at ``path`` as an HTML string using pdfminer.

        Args:
            path: Location of the PDF file; opened in binary mode.

        Returns:
            The HTML that pdfminer wrote into an in-memory text buffer.
        """
        output_string = StringIO()
        with open(path, 'rb') as fin:
            # codec=None because the target is a text buffer, not a byte
            # stream, so pdfminer must not encode its output.
            extract_text_to_fp(
                fin,
                output_string,
                laparams=LAParams(),
                output_type='html',
                codec=None,
            )
        return output_string.getvalue()

    def tables_to_html(self, tables) -> List[str]:
        """Render extracted tables as HTML ``<table>`` strings.

        Args:
            tables: Iterable of tables, each a sequence of rows whose first
                row supplies the column headers.

        Returns:
            One HTML string per non-empty table, in input order. Every
            table carries ``border=1`` and the id ``table_data``.
        """
        html_tables = []
        for table in tables:
            if not table:
                # A table without any rows has no header row to use as
                # column names; skip it instead of failing on table[0].
                continue
            header, *rows = table
            frame = pd.DataFrame(rows, columns=header)
            html_tables.append(
                frame.to_html(index=False, border=1, table_id="table_data")
            )
        return html_tables

    def extract_tables_from_pdf(self, pdf_path) -> List:
        """Collect the tables pdfplumber finds on every page of a PDF.

        Args:
            pdf_path: Location of the PDF file, handed to
                ``pdfplumber.open``.

        Returns:
            A flat list of tables in page order, as returned by each page's
            ``extract_tables``.
        """
        tables = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                tables.extend(page.extract_tables())
        return tables

    def convert_pdf_tables_pdfplumber(self, path: Path) -> List[str]:
        """Extract all tables from the PDF at ``path`` and return them as HTML.

        Chains ``extract_tables_from_pdf`` and ``tables_to_html``.
        """
        return self.tables_to_html(self.extract_tables_from_pdf(path))
# Function to extract tables from PDF using Camelot
# def extract_tables_from_pdf_camelot(self,pdf_path):
# tables = camelot.read_pdf(pdf_path, flavor='stream', pages='all', split_text=True, strip_text='\n')
# return tables
# Function to convert extracted tables to HTML
# def tables_to_html_camelot(self,tables):
# html_tables = []
# for table in tables:
# df = table.df
# html_table = df.to_html(index=False, border=1, table_id="table_data")
# html_tables.append(html_table)
# return html_tables
# def convert_pdf_tables_camelot(self,path:Path):
# tables = self.extract_tables_from_pdf_camelot(path)
# html_tables = self.tables_to_html_camelot(tables)
# return html_tables