BA-Chatbot/data_service/parser/module_handbook_parser/IB_parser.py

119 lines
4.7 KiB
Python
Raw Normal View History

2023-11-15 14:28:48 +01:00
"""
This script is a specialized parser for the IB module handbook (Modulhandbuch), a PDF document containing detailed information about various academic modules.
The PDF is first converted to HTML; the script then processes that HTML to extract and structure essential data such as module names, credits, semesters, professors, learning goals, contents, and recommended literature.
It employs BeautifulSoup for HTML parsing and converts the content into a more accessible JSON format.
The script is tailored to the specific formatting styles used within the handbook to ensure accurate and comprehensive data extraction. The structured data is valuable for WPM recommendation.
"""
import os
import sys
from dotenv import load_dotenv
load_dotenv()
sys_path = os.environ.get('SYS_PATH')
sys.path.append(sys_path)
from bs4 import BeautifulSoup, PageElement, Tag
from converter.pdf_converter import PDFConverter
import json
from typing import List
# Text substituted for the "(cid:4)" marker found in extracted chapter text.
BULLET_POINT = '- '

# CSS attribute selectors matching the inline styles of the converted HTML:
# bold 9px spans carry chapter titles, italic 9px spans carry the labels of
# the summary table at the top of a module.
CHAPTER_STYLE = '[style="font-family: NimbusSanL-Bold; font-size:9px"]'
MODULE_TOP_SUMMARY_STYLE = '[style="font-family: NimbusRomNo9L-ReguItal; font-size:9px"]'

# Labels of the summary rows at the top of each module (German, as printed
# in the handbook).
summary_points = {
    "NAME": "Name",
    "CREDITS": "Kreditpunkte",
    "SEMESTER": "Semester",
    "STUD_PROGRAMS": "Studiengänge",
}

# Chapter titles that delimit the sections of a module description.
titles = {
    "WORK_LOAD": "Arbeitsaufwand (work load)",
    "SEMESTER_HOURS": "Semesterwochenstunden",
    "GOALS": "Lernziele/Kompetenzen",
    "PROFS": "Dozentinnen / Dozenten",
    "BOOKS": "Literatur",
    "CONTENT": "Inhalte",
}
def get_chapter(start_element):
    """Collect the text of the chapter that follows a chapter-title span.

    Starting at the sibling right after the title's parent element, the text
    of every following sibling tag is gathered until one of these is reached:

    * a tag containing a bold 9px span whose stripped text is one of the
      known chapter titles in ``titles`` (the next chapter begins),
    * a tag containing a bold 11px span, the style used by
      ``get_all_module_positions`` to find module headings (the next module
      begins),
    * a sibling that is not a ``Tag`` (a text node or the end of the
      sibling chain).

    Args:
        start_element: The span holding the chapter title; the siblings of
            its parent are walked.

    Returns:
        The concatenated chapter text with every literal "(cid:4)" marker
        replaced by ``BULLET_POINT``. Empty string if nothing follows.
    """
    chapter_selector = f'span{CHAPTER_STYLE}'
    # Same style get_all_module_positions searches for. Without this stop the
    # last chapter of a module would swallow the next module's heading and
    # summary, because sibling traversal is not limited to the module slice.
    module_heading_selector = (
        'span[style="font-family: NimbusSanL-Bold; font-size:11px"]')

    parts = []
    current_element = start_element.parent.next_sibling
    # Testing for Tag in the loop condition also guards the very first
    # sibling; a text node there has no select_one() and used to raise.
    while isinstance(current_element, Tag):
        next_title = current_element.select_one(chapter_selector)
        if next_title and next_title.text.strip() in titles.values():
            break
        if current_element.select_one(module_heading_selector):
            break
        parts.append(current_element.text)
        current_element = current_element.next_sibling
    return "".join(parts).replace("(cid:4)", BULLET_POINT)
def get_all_module_positions(soup: BeautifulSoup):
    """Locate every module heading in the parsed handbook.

    A heading is a span whose inline style is bold NimbusSanL at 11px.

    Args:
        soup: The parsed HTML document of the handbook.

    Returns:
        A list of dicts in document order, each with
        ``"index"`` (position of the heading's parent among the children of
        ``soup.body``) and ``"title"`` (the heading span's text).
    """
    heading_attrs = {'style': 'font-family: NimbusSanL-Bold; font-size:11px'}
    return [
        {"index": soup.body.index(heading.parent), "title": heading.text}
        for heading in soup.find_all('span', heading_attrs)
    ]
def _sibling_text(element, steps):
    """Return the stripped text of the sibling ``steps`` places after
    ``element``, or ``None`` if the sibling chain ends before that."""
    current = element
    for _ in range(steps):
        current = current.next_sibling
        if current is None:
            return None
    return current.text.strip()


def parse_module(elements: List[PageElement]):
    """Extract the structured fields of one module from its page elements.

    Only ``div`` elements are inspected. A div containing a summary-label
    span (``MODULE_TOP_SUMMARY_STYLE``) may yield ``"credits"`` or
    ``"semester"``; a div containing a chapter-title span (``CHAPTER_STYLE``)
    may yield ``"content"``, ``"goals"``, ``"profs"`` or ``"books"`` via
    ``get_chapter``.

    Args:
        elements: The top-level elements belonging to a single module.

    Returns:
        A dict holding whichever of the keys above were found. A summary
        value whose expected sibling element is missing is left out instead
        of raising.
    """
    # Chapter title text -> key under which the chapter text is stored.
    chapter_keys = {
        titles["CONTENT"]: "content",
        titles["GOALS"]: "goals",
        titles["PROFS"]: "profs",
        titles["BOOKS"]: "books",
    }
    # Summary label text -> (result key, number of siblings between the
    # label's parent and the element holding the value).
    # NOTE(review): the semester value sits three siblings away, presumably
    # due to the handbook's table layout - confirm against the converted HTML.
    summary_fields = {
        summary_points["CREDITS"]: ("credits", 1),
        summary_points["SEMESTER"]: ("semester", 3),
    }

    module = {}
    for element in elements:
        if element.name != "div":
            continue

        summary_elem = element.select_one(f'span{MODULE_TOP_SUMMARY_STYLE}')
        if summary_elem:
            field = summary_fields.get(summary_elem.text.strip())
            if field is not None:
                key, steps = field
                # Siblings are only dereferenced for labels that are actually
                # used, and a missing sibling no longer aborts the parse.
                value = _sibling_text(summary_elem.parent, steps)
                if value is not None:
                    module[key] = value

        chapter = element.select_one(f'span{CHAPTER_STYLE}')
        if chapter:
            key = chapter_keys.get(chapter.text.strip())
            if key is not None:
                module[key] = get_chapter(chapter)
    return module
def parse_ib_handbook(html: str):
    """Parse the HTML version of the IB module handbook into module dicts.

    The document is split at the module headings found by
    ``get_all_module_positions``; each slice of ``soup.body`` children is
    handed to ``parse_module``.

    Args:
        html: The handbook converted to HTML.

    Returns:
        A list with one dict per module. Every dict has ``"title"`` and
        ``"content"``; when no content chapter was found, the goals chapter
        is moved into ``"content"``, or ``"content"`` is set to "" if there
        are no goals either.
    """
    soup = BeautifulSoup(html, 'html.parser')
    modules = get_all_module_positions(soup=soup)

    # A module ends where the next one starts. The last module has no
    # successor, so it runs to the end of the body (slice end None);
    # pairing modules only with modules[1:] silently dropped it.
    end_indices = [following["index"] for following in modules[1:]] + [None]

    result = []
    for module, end_index in zip(modules, end_indices):
        module_elements = soup.body.contents[module["index"]:end_index]
        module_data = parse_module(module_elements)
        module_data["title"] = module["title"]
        if "content" not in module_data:
            # Fall back to the goals text, or to an empty string.
            module_data["content"] = module_data.pop("goals", "")
        result.append(module_data)
    return result
if __name__ == "__main__":
    # Convert the handbook PDF to HTML, parse it and dump the modules as JSON
    # (non-ASCII characters are written unescaped, file encoded as UTF-8).
    pdf_converter = PDFConverter(init_haystack=False)
    parsed_modules = parse_ib_handbook(
        pdf_converter.convert_pdf_to_html("../../data/modulhandbuch-ib.pdf"))
    with open("output.json", "w", encoding="utf-8") as out_file:
        json.dump(parsed_modules, out_file, ensure_ascii=False)