73 lines
2.4 KiB
Python
73 lines
2.4 KiB
Python
|
|
||
|
import requests
|
||
|
import xml.etree.ElementTree as ET
|
||
|
# Root URL of the GROBID server. Keep the trailing slash: the endpoint path
# below has no leading slash of its own.
GROBID_BASE_URL = "http://localhost:8070/"
# Path, relative to the root URL, of GROBID's header-processing endpoint.
GROBID_HEADER_PARSER_URL = "api/processHeaderDocument"
|
||
|
|
||
|
class PaperParser:
    """Extract the title, keywords and abstract of a PDF paper via GROBID.

    The PDF is posted to a GROBID server's header-processing endpoint, which
    answers with TEI XML; that XML is then searched for the fields of interest.

    Attributes:
        base_url: Root URL of the GROBID server, or None to use the
            module-level GROBID_BASE_URL.
        timeout: Value passed to ``requests.post`` as its ``timeout``
            (seconds); None waits indefinitely.
    """

    # XML namespace that qualifies every element looked up in the TEI document.
    _TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}

    def __init__(self, base_url=None, timeout=60.0) -> None:
        """
        Creates a parser bound to a GROBID server.

        Args:
            base_url (str, optional): Root URL of the GROBID server. When
                None (the default) the module-level GROBID_BASE_URL is used.
            timeout (float, optional): Seconds to wait for the server before
                giving up. Pass None to wait indefinitely.
        """
        # The default URL is resolved lazily in _getXMLHeader, so constructing
        # a parser never depends on module-level state.
        self.base_url = base_url
        self.timeout = timeout

    @staticmethod
    def _elementText(element):
        """
        Returns the complete, stripped text of an XML element.

        Unlike ``element.text`` this includes text that follows or is nested
        inside inline child elements.

        Args:
            element (xml.etree.ElementTree.Element): The element, or None.

        Returns:
            str: The text, or None if the element is None or has no
            non-whitespace text.
        """
        if element is None:
            return None
        text = "".join(element.itertext()).strip()
        return text or None

    def _getXMLHeader(self, file_path: str) -> str:
        """
        Converts the header of a PDF paper to XML using the GROBID service.

        Args:
            file_path (str): The file path of the PDF document.

        Returns:
            str: The XML representation of the paper's header. This is the
            raw response body, so it is an empty string when the server
            answers successfully but without content (for example a 204).

        Raises:
            OSError: If the PDF cannot be opened.
            requests.exceptions.HTTPError: If the server answers with a 4xx
                or 5xx status.
            requests.exceptions.RequestException: On connection errors or
                when the timeout expires.
        """
        base_url = GROBID_BASE_URL if self.base_url is None else self.base_url
        # Tolerate a base URL given with or without a trailing slash.
        url = base_url.rstrip("/") + "/" + GROBID_HEADER_PARSER_URL

        with open(file_path, "rb") as pdf_file:
            response = requests.post(
                url, files={"input": pdf_file}, timeout=self.timeout
            )

        # Raises for 4xx/5xx. Every other status falls through and returns
        # the body, so the caller always receives a string, never None.
        response.raise_for_status()
        return response.text

    def parseHeader(self, file_path: str, debug_dump_path=None) -> dict:
        """
        Parses the XML header of a paper to extract the title, keywords, and abstract.

        Args:
            file_path (str): The file path of the PDF document.
            debug_dump_path (str, optional): If given, the raw TEI XML is
                written to this path (UTF-8) for inspection. Nothing is
                written by default.

        Returns:
            dict: A dictionary with the keys "title" (str or None),
            "keywords" (list of str, possibly empty) and "abstract"
            (str or None; multiple paragraphs are joined with newlines).
            All fields are empty when the service returned no XML.

        Raises:
            xml.etree.ElementTree.ParseError: If the response is not
                well-formed XML.
        """
        tei_xml = self._getXMLHeader(file_path=file_path)

        if debug_dump_path is not None and tei_xml:
            # Explicit encoding: the platform default may be unable to
            # represent characters that occur in the paper.
            with open(debug_dump_path, "w", encoding="utf-8") as fp:
                fp.write(tei_xml)

        # No document to parse (e.g. an empty response body): report every
        # field as missing instead of failing inside the XML parser.
        if not tei_xml or not tei_xml.strip():
            return {"title": None, "keywords": [], "abstract": None}

        ns = self._TEI_NS
        root = ET.fromstring(tei_xml)

        # Extract title
        title = self._elementText(
            root.find(".//tei:titleStmt/tei:title", namespaces=ns)
        )

        # Extract keywords, skipping empty <term> elements
        keywords = []
        for term in root.findall(".//tei:keywords/tei:term", namespaces=ns):
            keyword = self._elementText(term)
            if keyword is not None:
                keywords.append(keyword)

        # Extract abstract. Paragraphs are searched at any depth below
        # <abstract> because they may be wrapped in intermediate elements
        # such as <div>; all paragraphs are kept, not only the first.
        abstract_text = None
        abstract_element = root.find(".//tei:abstract", namespaces=ns)
        if abstract_element is not None:
            paragraphs = []
            for paragraph in abstract_element.findall(".//tei:p", namespaces=ns):
                paragraph_text = self._elementText(paragraph)
                if paragraph_text is not None:
                    paragraphs.append(paragraph_text)
            if paragraphs:
                abstract_text = "\n".join(paragraphs)
            else:
                # No <p> children: fall back to whatever text the
                # <abstract> element itself holds.
                abstract_text = self._elementText(abstract_element)

        # Combine information into dictionary
        return {
            "title": title,
            "keywords": keywords,
            "abstract": abstract_text,
        }
|
||
|
|