BA-Chatbot/data_service/parser/paper_parser.py

73 lines
2.4 KiB
Python
Raw Permalink Normal View History

2023-11-15 14:28:48 +01:00
import requests
import xml.etree.ElementTree as ET
# Root URL of the GROBID server. It ends with a slash so that an endpoint
# path can be appended to it directly.
GROBID_BASE_URL = "http://localhost:8070/"

# Endpoint path (relative to GROBID_BASE_URL) that converts a PDF header
# into XML.
GROBID_HEADER_PARSER_URL = "api/processHeaderDocument"
class PaperParser:
    """Extract title, keywords and abstract from the header of a PDF paper.

    The PDF is posted to the GROBID header endpoint, which answers with an
    XML document in the TEI namespace; the requested fields are then read
    from that XML.

    Attributes:
        REQUEST_TIMEOUT: ``(connect, read)`` timeout in seconds handed to
            ``requests.post``. Override it on a subclass or an instance when
            the GROBID server needs more (or less) time.
    """

    # Without a timeout an unresponsive GROBID server would block forever.
    REQUEST_TIMEOUT = (10, 300)

    # Namespace URI used by the elements of the TEI document.
    _TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"

    def _getXMLHeader(self, file_path):
        """Convert the header of a PDF paper to XML using the GROBID service.

        Args:
            file_path (str): The file path of the PDF document.

        Returns:
            str: The XML representation of the paper's header.

        Raises:
            OSError: If the PDF file cannot be opened.
            requests.HTTPError: If GROBID answers with an error status, or
                with a success status other than 200 (for example 204, which
                presumably means that no header content could be extracted
                -- confirm against the GROBID REST documentation).
            requests.RequestException: For connection problems and timeouts.
        """
        url = GROBID_BASE_URL + GROBID_HEADER_PARSER_URL
        with open(file_path, "rb") as file:
            response = requests.post(
                url,
                files={"input": file},
                timeout=self.REQUEST_TIMEOUT,
            )

        # Raises for 4xx/5xx responses.
        response.raise_for_status()

        # raise_for_status() stays silent for 2xx codes such as 204; fail
        # loudly here instead of implicitly returning None to the caller.
        if response.status_code != 200:
            raise requests.HTTPError(
                f"GROBID returned status {response.status_code} "
                f"without header XML for {file_path!r}",
                response=response,
            )
        return response.text

    def parseHeader(self, file_path, *, debug_dump_path="tei_xml.txt"):
        """Parse the header of a paper into title, keywords and abstract.

        Args:
            file_path (str): The file path of the PDF document.
            debug_dump_path (str | None): Where the raw XML returned by
                GROBID is written for debugging. The default keeps the
                historical behaviour of writing ``tei_xml.txt`` into the
                current working directory; pass ``None`` to skip the dump.

        Returns:
            dict: A dictionary with the keys ``"title"`` (str or None),
            ``"keywords"`` (list of str) and ``"abstract"`` (str or None).

        Raises:
            xml.etree.ElementTree.ParseError: If the response is not
                well-formed XML.
            Exception: Anything raised by ``_getXMLHeader``.
        """
        tei_xml = self._getXMLHeader(file_path=file_path)

        if debug_dump_path is not None:
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent characters that occur in paper titles and abstracts.
            with open(debug_dump_path, "w", encoding="utf-8") as fp:
                fp.write(tei_xml)

        return self._extractHeaderFields(tei_xml)

    @classmethod
    def _extractHeaderFields(cls, tei_xml):
        """Read title, keywords and abstract out of a TEI XML string."""
        ns = {"tei": cls._TEI_NAMESPACE}
        root = ET.fromstring(tei_xml)

        title = cls._elementText(
            root.find(".//tei:titleStmt/tei:title", namespaces=ns)
        )

        keywords = []
        for term in root.findall(".//tei:keywords/tei:term", namespaces=ns):
            term_text = cls._elementText(term)
            if term_text is not None:
                keywords.append(term_text)

        return {
            "title": title,
            "keywords": keywords,
            "abstract": cls._abstractText(root, ns),
        }

    @classmethod
    def _abstractText(cls, root, ns):
        """Join every paragraph of the abstract, or return None if absent."""
        abstract_element = root.find(".//tei:abstract", namespaces=ns)
        if abstract_element is None:
            return None

        # iter() also reaches paragraphs that are wrapped in <div> elements
        # instead of being direct children of <abstract>.
        paragraph_tag = f"{{{cls._TEI_NAMESPACE}}}p"
        paragraphs = []
        for paragraph in abstract_element.iter(paragraph_tag):
            paragraph_text = cls._elementText(paragraph)
            if paragraph_text is not None:
                paragraphs.append(paragraph_text)

        return "\n".join(paragraphs) if paragraphs else None

    @staticmethod
    def _elementText(element):
        """Return the whitespace-normalised text of *element*, or None.

        ``element.text`` alone stops at the first child element, so text that
        contains inline markup would be cut short; ``itertext()`` collects
        the text of the element and of all its descendants.
        """
        if element is None:
            return None
        text = " ".join("".join(element.itertext()).split())
        return text if text else None