119 lines
4.7 KiB
Python
119 lines
4.7 KiB
Python
|
|
||
|
"""
|
||
|
This script is a specialized parser for the IB module handbook (Modulhandbuch), a PDF document containing detailed information about various academic modules.
The handbook is converted from PDF to HTML, and the script processes that HTML to extract and structure essential data such as module names, credits, semesters, professors, learning goals, contents, and recommended literature.
It employs BeautifulSoup for HTML parsing and converts the content into a more accessible JSON format.
The script is tailored to handle the specific formatting styles within the handbook, ensuring accurate and comprehensive data extraction. This structured data is valuable for WPM recommendation.
|
||
|
"""
|
||
|
import json
import os
import sys
from typing import List

from dotenv import load_dotenv

# Load environment variables (presumably from a local .env file) before
# SYS_PATH is read below.
load_dotenv()

# SYS_PATH is an extra import directory. It has to be on sys.path before the
# project-local `converter` package is imported further down. When the
# variable is unset, nothing is appended (previously `None` was appended).
sys_path = os.environ.get('SYS_PATH')
if sys_path is not None:
    sys.path.append(sys_path)

from bs4 import BeautifulSoup, PageElement, Tag  # noqa: E402
from converter.pdf_converter import PDFConverter  # noqa: E402
|
||
|
|
||
|
# Text substituted for every "(cid:4)" marker found in extracted chapter text
# (presumably the converter's placeholder for the handbook's bullet glyph).
BULLET_POINT = '- '

# CSS attribute selector for the bold 9px spans whose text is compared against
# the chapter headings listed in `titles`.
CHAPTER_STYLE = '[style="font-family: NimbusSanL-Bold; font-size:9px"]'

# CSS attribute selector for the italic 9px spans whose text is compared
# against the summary labels listed in `summary_points`.
MODULE_TOP_SUMMARY_STYLE = '[style="font-family: NimbusRomNo9L-ReguItal; font-size:9px"]'

# Labels (German, exactly as printed in the handbook) of the summary fields at
# the top of a module description.
summary_points = {
    "NAME": "Name",
    "CREDITS": "Kreditpunkte",
    "SEMESTER": "Semester",
    "STUD_PROGRAMS": "Studiengänge",
}

# Chapter headings (German, exactly as printed in the handbook). Any of these
# headings terminates the chapter that precedes it.
titles = {
    "WORK_LOAD": "Arbeitsaufwand (work load)",
    "SEMESTER_HOURS": "Semesterwochenstunden",
    "GOALS": "Lernziele/Kompetenzen",
    "PROFS": "Dozentinnen / Dozenten",
    "BOOKS": "Literatur",
    "CONTENT": "Inhalte",
}
|
||
|
|
||
|
|
||
|
def get_chapter(start_element):
    """Collect the text that follows a chapter heading, up to the next heading.

    Starting at the sibling right after the heading's parent, the text of each
    consecutive ``Tag`` sibling is gathered. Collection stops at the first
    sibling that contains a known chapter heading (a value of ``titles``), at
    the first sibling that is not a ``Tag``, or when the siblings run out.

    Args:
        start_element: The heading span; its parent's following siblings hold
            the chapter body.

    Returns:
        The concatenated sibling text with every "(cid:4)" marker replaced by
        ``BULLET_POINT``. Empty string when nothing follows the heading.
    """
    heading_selector = f'span{CHAPTER_STYLE}'
    known_titles = set(titles.values())
    parts = []

    current_element = start_element.parent.next_sibling
    # Checking the type on every iteration (including the first) keeps a text
    # node directly after the heading from raising on `select_one`.
    while isinstance(current_element, Tag):
        heading = current_element.select_one(heading_selector)
        if heading is not None and heading.text.strip() in known_titles:
            break
        parts.append(current_element.text)
        current_element = current_element.next_sibling

    return "".join(parts).replace("(cid:4)", BULLET_POINT)
|
||
|
|
||
|
|
||
|
def get_all_module_positions(soup: BeautifulSoup):
    """Locate every module heading in the parsed handbook.

    A module heading is a span styled as bold 11px NimbusSanL. For each one,
    the position of its parent among the direct children of ``<body>`` is
    recorded together with the heading text.

    Args:
        soup: The parsed handbook HTML.

    Returns:
        A list of ``{"index": int, "title": str}`` dicts in document order.

    Raises:
        ValueError: If a heading's parent is not a direct child of ``<body>``
            (raised by ``Tag.index``).
    """
    headings = soup.find_all(
        'span', {'style': 'font-family: NimbusSanL-Bold; font-size:11px'})
    return [
        {"index": soup.body.index(heading.parent), "title": heading.text}
        for heading in headings
    ]
|
||
|
|
||
|
|
||
|
def _sibling_text(element, steps):
    """Return the stripped text of the sibling `steps` places after `element`.

    Returns ``None`` when the sibling chain ends before `steps` is reached.
    """
    node = element
    for _ in range(steps):
        node = node.next_sibling
        if node is None:
            return None
    return node.text.strip()


def parse_module(elements: List[PageElement]):
    """Extract the summary fields and chapters of one module.

    Only ``div`` tags are inspected. A div holding a summary-styled span
    contributes ``credits`` (text of the next sibling of the span's parent) or
    ``semester`` (text of the third sibling after the span's parent). A div
    holding a chapter-styled span whose text is one of the handled headings
    contributes ``content``, ``goals``, ``profs`` or ``books`` via
    ``get_chapter``.

    Args:
        elements: The nodes belonging to a single module.

    Returns:
        A dict with whichever of the keys ``credits``, ``semester``,
        ``content``, ``goals``, ``profs`` and ``books`` were found. A summary
        field whose value sibling does not exist is left out.
    """
    summary_selector = f'span{MODULE_TOP_SUMMARY_STYLE}'
    chapter_selector = f'span{CHAPTER_STYLE}'
    # Number of siblings to step over, starting at the label's parent, to
    # reach the element that holds the value.
    summary_fields = {
        summary_points["CREDITS"]: ("credits", 1),
        summary_points["SEMESTER"]: ("semester", 3),
    }
    # Headings that are extracted; the remaining `titles` entries only act as
    # chapter terminators inside get_chapter.
    chapter_fields = {
        titles["CONTENT"]: "content",
        titles["GOALS"]: "goals",
        titles["PROFS"]: "profs",
        titles["BOOKS"]: "books",
    }

    module = {}
    for element in elements:
        # Text nodes between the divs carry no module data.
        if not isinstance(element, Tag) or element.name != "div":
            continue

        summary_elem = element.select_one(summary_selector)
        if summary_elem is not None:
            field = summary_fields.get(summary_elem.text.strip())
            if field is not None:
                key, steps = field
                value = _sibling_text(summary_elem.parent, steps)
                if value is not None:
                    module[key] = value

        chapter = element.select_one(chapter_selector)
        if chapter is not None:
            key = chapter_fields.get(chapter.text.strip())
            if key is not None:
                module[key] = get_chapter(chapter)

    return module
|
||
|
|
||
|
|
||
|
def parse_ib_handbook(html: str):
    """Parse the handbook HTML into a list of module dicts.

    The direct children of ``<body>`` are split at the module headings found
    by ``get_all_module_positions``. Each slice is handed to ``parse_module``.
    The last module runs to the end of ``<body>``; it used to be dropped
    because modules were only paired with a following module.

    Args:
        html: The handbook as an HTML string.

    Returns:
        One dict per module. Every dict has ``title`` and ``content``. When a
        module has no content chapter, its goals chapter is moved into
        ``content``, or ``content`` is the empty string if neither exists.
    """
    soup = BeautifulSoup(html, 'html.parser')
    modules = get_all_module_positions(soup=soup)
    if not modules:
        return []

    body_contents = soup.body.contents
    # Each module ends where the next one starts; `None` lets the final slice
    # run to the end of the body.
    end_indices = [following["index"] for following in modules[1:]] + [None]

    result = []
    for module, end_index in zip(modules, end_indices):
        module_data = parse_module(body_contents[module["index"]:end_index])
        module_data["title"] = module["title"]
        if "content" not in module_data:
            module_data["content"] = module_data.pop("goals", "")
        result.append(module_data)
    return result
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Convert the handbook PDF to HTML, parse it into module dicts and write
    # them to output.json in the current working directory.
    # NOTE(review): the PDF path is relative to the working directory, not to
    # this file; confirm the script is always started from its own folder.
    converter = PDFConverter(init_haystack=False)
    html = converter.convert_pdf_to_html("../../data/modulhandbuch-ib.pdf")
    modules = parse_ib_handbook(html)
    with open("output.json", "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps umlauts readable in the output file.
        json.dump(modules, f, ensure_ascii=False)
|