84 lines
3.6 KiB
Python
84 lines
3.6 KiB
Python
"""
The PDFConverter class is a utility for converting PDF files into various formats for subsequent processing.
It employs libraries like pdfminer and pdfplumber (and, optionally, Haystack) to facilitate these conversions.
The class offers functionality to convert PDFs to text using Haystack or pdfminer, to convert PDFs to HTML with pdfminer, and to extract tables from PDFs with pdfplumber.
While it currently uses specific libraries for these tasks, it is structured so that other libraries, such as Camelot for table extraction, can be integrated later, as indicated by the commented-out methods.
This converter acts as a pre-processing component in data processing workflows, preparing PDF content for more detailed analysis or for ingestion into content management systems.
"""
||
|
from pathlib import Path
|
||
|
from typing import List
|
||
|
from io import StringIO
|
||
|
from pdfminer.high_level import extract_text_to_fp, extract_text
|
||
|
from pdfminer.layout import LAParams
|
||
|
import pdfplumber
|
||
|
import pandas as pd
|
||
|
# import camelot
|
||
|
|
||
|
class PDFConverter:
    """Convert PDF files to plain text, HTML, or HTML-rendered tables.

    Text extraction is available through Haystack's ``PDFToTextConverter``
    (optional) and through pdfminer. HTML conversion uses pdfminer. Table
    extraction uses pdfplumber, and pandas renders each table as HTML.
    """

    def __init__(self, init_haystack=True) -> None:
        """Create the converter.

        Args:
            init_haystack: When true, import Haystack and build a
                ``PDFToTextConverter``. When false, Haystack is not imported
                and ``haystack_converter`` stays ``None``.
        """
        # Always define the attribute so later checks never hit an
        # AttributeError when Haystack was not initialised.
        self.haystack_converter = None
        if init_haystack:
            # Imported lazily so Haystack is only required when requested.
            from haystack.nodes import PDFToTextConverter

            self.haystack_converter = PDFToTextConverter(
                remove_numeric_tables=True,
                valid_languages=["de", "en"],
            )

    def convert_pdf_to_text_haystack(self, path: Path) -> List:
        """Convert a PDF with the Haystack converter.

        Args:
            path: Location of the PDF file.

        Returns:
            The result of ``haystack_converter.convert``; an empty list when
            no Haystack converter was initialised.
        """
        if not self.haystack_converter:
            return []
        return self.haystack_converter.convert(file_path=path, meta=None)

    def convert_pdf_to_text_pdfminer(self, path: Path) -> str:
        """Return the text pdfminer's ``extract_text`` yields for ``path``."""
        return extract_text(path)

    def convert_pdf_to_html(self, path: Path) -> str:
        """Render the PDF at ``path`` to an HTML string using pdfminer.

        Args:
            path: Location of the PDF file.

        Returns:
            The HTML text pdfminer wrote to the in-memory buffer.
        """
        output_string = StringIO()
        with open(path, 'rb') as fin:
            # codec=None because the output goes to a text buffer (StringIO).
            extract_text_to_fp(
                fin,
                output_string,
                laparams=LAParams(),
                output_type='html',
                codec=None,
            )
        return output_string.getvalue()

    def tables_to_html(self, tables) -> List[str]:
        """Convert extracted tables to HTML ``<table>`` strings.

        Args:
            tables: Iterable of tables, each a sequence of rows whose first
                row supplies the column headers.

        Returns:
            One HTML string per non-empty table, in input order.
        """
        html_tables = []
        for table in tables:
            if not table:
                # An empty table has no header row to build columns from.
                continue
            header, *rows = table
            df = pd.DataFrame(rows, columns=header)
            html_tables.append(
                df.to_html(index=False, border=1, table_id="table_data")
            )
        return html_tables

    def extract_tables_from_pdf(self, pdf_path) -> List:
        """Extract every table from every page of a PDF using pdfplumber.

        Args:
            pdf_path: Location of the PDF file.

        Returns:
            A flat list of the tables returned by ``page.extract_tables()``
            for each page, in page order.
        """
        tables = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                tables.extend(page.extract_tables())
        return tables

    def convert_pdf_tables_pdfplumber(self, path: Path) -> List[str]:
        """Extract the tables of the PDF at ``path`` and return them as HTML."""
        return self.tables_to_html(self.extract_tables_from_pdf(path))
|
||
|
|
||
|
    # Function to extract tables from PDF using Camelot
    # def extract_tables_from_pdf_camelot(self, pdf_path):
    #     tables = camelot.read_pdf(pdf_path, flavor='stream', pages='all', split_text=True, strip_text='\n')
    #     return tables

    # Function to convert Camelot-extracted tables to HTML
    # def tables_to_html_camelot(self, tables):
    #     html_tables = []
    #     for table in tables:
    #         df = table.df
    #         html_table = df.to_html(index=False, border=1, table_id="table_data")
    #         html_tables.append(html_table)
    #     return html_tables

    # def convert_pdf_tables_camelot(self, path: Path):
    #     tables = self.extract_tables_from_pdf_camelot(path)
    #     html_tables = self.tables_to_html_camelot(tables)
    #     return html_tables
|