# BA-Chatbot/data_service/parser/paper_parser.py


import requests
import xml.etree.ElementTree as ET
# Root address of the GROBID server (kept with its trailing slash so the
# endpoint path below can be appended directly).
GROBID_BASE_URL = "http://localhost:8070/"
# Endpoint path, relative to the base URL, used for header extraction.
GROBID_HEADER_PARSER_URL = "api/processHeaderDocument"

class PaperParser:
    """Extract title, keywords and abstract from the header of a PDF paper.

    The PDF is posted to a GROBID server, which answers with the paper's
    header as TEI XML; the relevant fields are then read out of that XML.

    Attributes:
        base_url: Base URL of the GROBID server, or None to fall back to the
            module-level ``GROBID_BASE_URL``.
        timeout: Timeout in seconds handed to ``requests.post``.
    """

    # Namespace map for the TEI elements looked up in the parsed header.
    _TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}

    # Class-level defaults so instances created without running __init__
    # (e.g. by subclasses that do not call super().__init__) still work.
    base_url = None
    timeout = 60.0

    def __init__(self, base_url=None, timeout=60.0) -> None:
        """
        Creates a parser bound to a GROBID server.

        Args:
            base_url (str, optional): Base URL of the GROBID server. When
                None, the module-level ``GROBID_BASE_URL`` is used.
            timeout (float, optional): Timeout in seconds for the HTTP
                request to GROBID. Defaults to 60 seconds.
        """
        self.base_url = base_url
        self.timeout = timeout

    @staticmethod
    def _element_text(element):
        """
        Returns the full text of an element, including the text of nested
        inline elements, or None when the element is missing or empty.
        """
        if element is None:
            return None
        # itertext() also yields text that follows inline children such as
        # <ref> or <hi>; element.text alone would stop at the first child.
        text = "".join(element.itertext())
        return text if text else None

    def _getXMLHeader(self, file_path: str):
        """
        Converts the header of a PDF paper to XML using the GROBID service.

        Args:
            file_path (str): The file path of the PDF document.

        Returns:
            str: The XML representation of the paper's header, or None when
            the server answered successfully but with a status other than
            200 (i.e. without a usable document body).

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If the server does not answer within
                ``self.timeout`` seconds.
            OSError: If the PDF file cannot be opened.
        """
        base_url = GROBID_BASE_URL if self.base_url is None else self.base_url
        # Normalise the join so a base URL works with or without a trailing
        # slash.
        url = base_url.rstrip("/") + "/" + GROBID_HEADER_PARSER_URL

        with open(file_path, "rb") as file:
            # Without a timeout an unresponsive server blocks the caller
            # indefinitely.
            response = requests.post(
                url, files={"input": file}, timeout=self.timeout
            )

        response.raise_for_status()
        if response.status_code != 200:
            # Successful status but not 200 (e.g. 204): there is no header
            # XML to return.
            return None
        return response.text

    def parseHeader(self, file_path: str) -> dict:
        """
        Parses the XML header of a paper to extract the title, keywords, and abstract.

        Args:
            file_path (str): The file path of the PDF document.

        Returns:
            dict: A dictionary with the keys ``"title"`` (str or None),
            ``"keywords"`` (list of str, possibly empty) and ``"abstract"``
            (str or None). Multiple abstract paragraphs are joined with a
            newline. When GROBID returns no header XML, all fields are empty.

        Raises:
            xml.etree.ElementTree.ParseError: If the returned XML is malformed.
        """
        tei_xml = self._getXMLHeader(file_path=file_path)
        if tei_xml is None:
            # Nothing was extracted; mirror the "element missing" result
            # instead of failing on ET.fromstring(None).
            return {"title": None, "keywords": [], "abstract": None}

        ns = self._TEI_NS
        root = ET.fromstring(tei_xml)

        # Extract title
        title = self._element_text(
            root.find(".//tei:titleStmt/tei:title", namespaces=ns)
        )

        # Extract keywords, skipping empty <term/> entries
        keywords = [
            text
            for text in map(
                self._element_text,
                root.findall(".//tei:keywords/tei:term", namespaces=ns),
            )
            if text is not None
        ]

        # Extract abstract: match <p> at any depth below <abstract> so both
        # <abstract><p> and <abstract><div><p> layouts are covered, and keep
        # every paragraph rather than only the first one.
        paragraphs = [
            text
            for text in map(
                self._element_text,
                root.findall(".//tei:abstract//tei:p", namespaces=ns),
            )
            if text is not None
        ]
        abstract_text = "\n".join(paragraphs) if paragraphs else None

        # Combine information into dictionary
        return {
            "title": title,
            "keywords": keywords,
            "abstract": abstract_text,
        }