BA-Chatbot/data_service/parser/paper_parser.py


import requests
import xml.etree.ElementTree as ET
# Base URL of the GROBID service. It is joined to the endpoint path below by
# plain string concatenation in PaperParser._getXMLHeader, so the trailing
# slash is required.
GROBID_BASE_URL= "http://localhost:8070/"
# Endpoint path (relative to GROBID_BASE_URL, no leading slash) that the PDF is
# posted to in order to obtain the header as XML.
GROBID_HEADER_PARSER_URL= "api/processHeaderDocument"

class PaperParser:
    """Extract title, keywords, and abstract from the header of a PDF paper.

    The PDF is posted to a GROBID service, which answers with the header as
    TEI XML; that XML is then parsed with ``xml.etree.ElementTree``.

    Attributes:
        None
    """

    # Namespace map used by every XPath query against the TEI document.
    _TEI_NAMESPACES = {"tei": "http://www.tei-c.org/ns/1.0"}

    def __init__(self) -> None:
        """Create a parser; it keeps no per-instance state."""

    def _getXMLHeader(self, file_path):
        """
        Converts the header of a PDF paper to XML using the GROBID service.

        Args:
            file_path (str): The file path of the PDF document.

        Returns:
            str: The XML representation of the paper's header.

        Raises:
            OSError: If the PDF file cannot be opened.
            requests.HTTPError: If the service answers with anything other
                than ``200 OK``.
        """
        url = GROBID_BASE_URL + GROBID_HEADER_PARSER_URL
        # NOTE: no timeout is passed, so this call blocks until the service
        # responds.
        with open(file_path, "rb") as pdf_file:
            response = requests.post(url, files={"input": pdf_file})

        # 4xx / 5xx answers raise here.
        response.raise_for_status()

        # raise_for_status() stays silent for non-200 success codes (e.g.
        # 204 No Content). Returning None for those would only surface later
        # as an unrelated TypeError, so fail loudly at the source instead.
        if response.status_code != 200:
            raise requests.HTTPError(
                f"GROBID returned unexpected status {response.status_code} "
                f"for {file_path!r}",
                response=response,
            )
        return response.text

    @staticmethod
    def _element_text(element):
        """Return the full text of *element*, or None if absent or empty.

        Unlike ``element.text``, this includes text nested inside inline child
        elements and the text following them, so inline markup does not
        truncate the result.
        """
        if element is None:
            return None
        text = "".join(element.itertext())
        return text or None

    def parseHeader(self, file_path):
        """
        Parses the XML header of a paper to extract the title, keywords, and abstract.

        Args:
            file_path (str): The file path of the PDF document.

        Returns:
            dict: A dictionary with the keys ``"title"`` (str or None),
            ``"keywords"`` (list of str, possibly empty) and ``"abstract"``
            (str or None). Multiple abstract paragraphs are joined with a
            newline.

        Raises:
            xml.etree.ElementTree.ParseError: If the service response is not
                well-formed XML.
        """
        tei_xml = self._getXMLHeader(file_path=file_path)

        # DEBUG WRITE: dumps the raw TEI XML into the current working
        # directory. The encoding is explicit so that non-ASCII characters do
        # not fail on platforms whose default encoding is not UTF-8.
        with open("tei_xml.txt", "w", encoding="utf-8") as fp:
            fp.write(tei_xml)

        ns = self._TEI_NAMESPACES
        root = ET.fromstring(tei_xml)

        # Extract title
        title = self._element_text(
            root.find(".//tei:titleStmt/tei:title", namespaces=ns)
        )

        # Extract keywords, skipping empty <term/> elements
        keywords = []
        for term in root.findall(".//tei:keywords/tei:term", namespaces=ns):
            term_text = self._element_text(term)
            if term_text is not None:
                keywords.append(term_text)

        # Extract abstract: search <p> at any depth below <abstract> (the
        # paragraphs may be wrapped in a <div>) and keep every paragraph,
        # not just the first one.
        paragraphs = []
        for paragraph in root.findall(".//tei:abstract//tei:p", namespaces=ns):
            paragraph_text = self._element_text(paragraph)
            if paragraph_text is not None:
                paragraphs.append(paragraph_text)
        abstract_text = "\n".join(paragraphs) if paragraphs else None

        # Combine information into dictionary
        return {
            "title": title,
            "keywords": keywords,
            "abstract": abstract_text,
        }
initial 2023-11-15 14:28:48 +01:00
			`import requests`
			`import xml.etree.ElementTree as ET`
			`GROBID_BASE_URL= "http://localhost:8070/"`
			`GROBID_HEADER_PARSER_URL= "api/processHeaderDocument"`

			`class PaperParser:`
			`def __init__(self) -> None:`
			`"""`
			`The PaperParser class is designed to parse the header of PDF papers, converting them to XML format, and then extracting relevant information like title, keywords, and abstract.`
			`Attributes:`
			`None`
			`"""`
			`pass`


			`def _getXMLHeader(self, file_path):`
			`"""`
			`Converts the header of a PDF paper to XML using the GROBID service.`

			`Args:`
			`file_path (str): The file path of the PDF document.`

			`Returns:`
			`str: The XML representation of the paper's header.`
			`"""`
			`url = GROBID_BASE_URL + GROBID_HEADER_PARSER_URL`
			`with open(file_path, 'rb') as file:`
			`response = requests.post(url, files={'input': file})`

			`if response.status_code == 200:`
			`return response.text`
			`else:`
			`response.raise_for_status()`
			`def parseHeader(self, file_path):`
			`"""`
			`Parses the XML header of a paper to extract the title, keywords, and abstract.`

			`Args:`
			`file_path (str): The file path of the PDF document.`

			`Returns:`
			`dict: A dictionary containing the title, keywords, and abstract of the paper.`
			`"""`
			`tei_xml=self._getXMLHeader(file_path=file_path)`
			`# DEBUG WRITE`
			`with open("tei_xml.txt", "w") as fp:`
			`fp.write(tei_xml)`
			`ns = {"tei": "http://www.tei-c.org/ns/1.0"} # Define the namespace`

			`root = ET.fromstring(tei_xml)`

			`# Extract title`
			`title_element = root.find(".//tei:titleStmt/tei:title", namespaces=ns)`
			`title = title_element.text if title_element is not None else None`

			`# Extract keywords`
			`keywords_elements = root.findall(".//tei:keywords/tei:term", namespaces=ns)`
			`keywords = [term.text for term in keywords_elements if term.text is not None]`

			`# Extract abstract`
			`abstract_element = root.find(".//tei:abstract/tei:p", namespaces=ns)`
			`abstract_text = abstract_element.text if abstract_element is not None else None`

			`# Combine information into dictionary`
			`paper_info = {`
			`"title": title,`
			`"keywords": keywords,`
			`"abstract": abstract_text`
			`}`

			`return paper_info`