import xml.etree.ElementTree as ET
from typing import Optional

import requests

GROBID_BASE_URL = "http://localhost:8070/"
GROBID_HEADER_PARSER_URL = "api/processHeaderDocument"

# (connect, read) timeouts in seconds passed to ``requests.post``. Without a
# timeout an unreachable or stalled server blocks the caller indefinitely.
# The read timeout is deliberately generous; pass ``timeout=None`` to
# ``PaperParser`` to restore the unbounded wait.
GROBID_REQUEST_TIMEOUT = (10, 300)

# Namespace prefix map used by every XPath query against the TEI document.
TEI_NAMESPACES = {"tei": "http://www.tei-c.org/ns/1.0"}


class PaperParser:
    """Extract title, keywords and abstract from a PDF via a GROBID server.

    The PDF is posted to the GROBID header-parsing endpoint, which answers
    with a TEI XML document; that document is then queried for the fields of
    interest.

    Args:
        base_url: Root URL of the GROBID service. A trailing slash is
            optional. Defaults to ``GROBID_BASE_URL``.
        timeout: Value forwarded to ``requests.post(timeout=...)``. Defaults
            to ``GROBID_REQUEST_TIMEOUT``; ``None`` waits forever.
    """

    def __init__(
        self,
        base_url: str = GROBID_BASE_URL,
        timeout=GROBID_REQUEST_TIMEOUT,
    ) -> None:
        # Normalise the slash so both "http://host:8070" and
        # "http://host:8070/" produce the same endpoint URL.
        self._header_url = base_url.rstrip("/") + "/" + GROBID_HEADER_PARSER_URL
        self._timeout = timeout

    @staticmethod
    def _joinedText(element) -> Optional[str]:
        """Return all text inside *element*, or ``None`` if there is none.

        Unlike ``element.text`` this includes text that follows or sits
        inside inline child elements.
        """
        if element is None:
            return None
        text = "".join(element.itertext())
        return text if text else None

    def _getXMLHeader(self, file_path: str) -> str:
        """
        Converts the header of a PDF paper to XML using the GROBID service.

        Args:
            file_path (str): The file path of the PDF document.

        Returns:
            str: The XML representation of the paper's header.

        Raises:
            OSError: If the PDF cannot be opened.
            requests.HTTPError: If the service answers with anything other
                than HTTP 200.
            requests.RequestException: On connection failures or timeouts.
        """
        with open(file_path, "rb") as file:
            response = requests.post(
                self._header_url,
                files={"input": file},
                timeout=self._timeout,
            )

        # Raises for 4xx/5xx responses.
        response.raise_for_status()

        # raise_for_status() stays silent for non-error codes such as 204
        # (which GROBID appears to use when nothing could be extracted --
        # confirm against the GROBID docs). Previously that path returned
        # None and crashed later with a TypeError; fail loudly here instead.
        if response.status_code != 200:
            raise requests.HTTPError(
                f"GROBID returned HTTP {response.status_code} with no usable "
                f"header XML for {file_path!r}",
                response=response,
            )
        return response.text

    def parseHeader(
        self,
        file_path: str,
        debug_dump_path: Optional[str] = "tei_xml.txt",
    ) -> dict:
        """
        Parses the XML header of a paper to extract the title, keywords, and abstract.

        Args:
            file_path (str): The file path of the PDF document.
            debug_dump_path (str | None): Where to write the raw TEI XML for
                debugging. Defaults to ``"tei_xml.txt"`` in the current
                working directory (the historical behaviour); pass ``None``
                to skip the dump.

        Returns:
            dict: A dictionary with the keys ``"title"`` (str or None),
            ``"keywords"`` (list of str) and ``"abstract"`` (str or None).

        Raises:
            requests.RequestException: Propagated from the GROBID request.
            xml.etree.ElementTree.ParseError: If the response is not
                well-formed XML.
        """
        tei_xml = self._getXMLHeader(file_path=file_path)

        if debug_dump_path is not None:
            # Explicit UTF-8 so non-ASCII characters in the paper cannot
            # trigger UnicodeEncodeError under a narrower locale encoding.
            with open(debug_dump_path, "w", encoding="utf-8") as fp:
                fp.write(tei_xml)

        root = ET.fromstring(tei_xml)

        # Extract title
        title = self._joinedText(
            root.find(".//tei:titleStmt/tei:title", namespaces=TEI_NAMESPACES)
        )

        # Extract keywords
        keywords = [
            term.text
            for term in root.findall(
                ".//tei:keywords/tei:term", namespaces=TEI_NAMESPACES
            )
            if term.text is not None
        ]

        # Extract abstract. Paragraphs are matched at any depth below
        # <abstract> because some GROBID versions appear to wrap them in a
        # <div> (TODO confirm against the deployed version); all paragraphs
        # are kept, not just the first one.
        paragraphs = [
            self._joinedText(paragraph)
            for paragraph in root.findall(
                ".//tei:abstract//tei:p", namespaces=TEI_NAMESPACES
            )
        ]
        non_empty = [text for text in paragraphs if text is not None]
        abstract_text = "\n".join(non_empty) if non_empty else None

        # Combine information into dictionary
        return {
            "title": title,
            "keywords": keywords,
            "abstract": abstract_text,
        }