""" This script is a specialized parser for the IB Modulehandbook, which is a PDF document containing detailed information about various academic modules. The script processes the handbook to extract and structure essential data such as module names, credits, semesters, professors, learning goals, contents, and recommended literature. It employs BeautifulSoup for HTML parsing, converting the content into a more accessible JSON format. The script is tailored to handle specific formatting styles within the handbook, ensuring accurate and comprehensive data extraction. This structured data is valuable for WPM Recommendation """ import os import sys from dotenv import load_dotenv load_dotenv() sys_path = os.environ.get('SYS_PATH') sys.path.append(sys_path) from bs4 import BeautifulSoup, PageElement, Tag from converter.pdf_converter import PDFConverter import json from typing import List BULLET_POINT = '- ' CHAPTER_STYLE = '[style="font-family: NimbusSanL-Bold; font-size:9px"]' MODULE_TOP_SUMMARY_STYLE = '[style="font-family: NimbusRomNo9L-ReguItal; font-size:9px"]' summary_points = { "NAME": "Name", "CREDITS": "Kreditpunkte", "SEMESTER": "Semester", "STUD_PROGRAMS": "Studiengänge" } titles = { "WORK_LOAD": "Arbeitsaufwand (work load)", "SEMESTER_HOURS": "Semesterwochenstunden", "GOALS": "Lernziele/Kompetenzen", "PROFS": "Dozentinnen / Dozenten", "BOOKS": "Literatur", "CONTENT": "Inhalte" } def get_chapter(start_element): current_element = start_element.parent.next_sibling content = "" while current_element: found_child = current_element.select_one( 'span[style="font-family: NimbusSanL-Bold; font-size:9px"]') if found_child and found_child.text.strip() in titles.values(): break content += current_element.text if isinstance(current_element.next_sibling, Tag): current_element = current_element.next_sibling else: break return content.replace("(cid:4)", BULLET_POINT) def get_all_module_positions(soup: BeautifulSoup): elements = soup.find_all( 'span', {'style': 'font-family: NimbusSanL-Bold; font-size:11px'}) result = [] for elem in elements: module = {"index": soup.body.index(elem.parent), "title": elem.text} result.append(module) return result def parse_module(elements: List[PageElement]): module = {} for element in elements: if element.name == "div" and element.select_one(f'span{MODULE_TOP_SUMMARY_STYLE}'): summary_elem = element.select_one( f'span{MODULE_TOP_SUMMARY_STYLE}') summary_text = summary_elem.text.strip() sibling=summary_elem.parent.next_sibling.text.strip() if summary_text == summary_points["CREDITS"]: module["credits"]=sibling if summary_text == summary_points["SEMESTER"]: sibling=summary_elem.parent.next_sibling.next_sibling.next_sibling.text.strip() module["semester"]=sibling if element.name == "div" and element.select_one(f'span[style="font-family: NimbusSanL-Bold; font-size:9px"]'): chapter = element.select_one( 'span[style="font-family: NimbusSanL-Bold; font-size:9px"]') chapter_title = chapter.text.strip() if chapter_title in titles.values(): if chapter_title == titles["CONTENT"]: module["content"] = get_chapter(chapter) if chapter_title == titles["GOALS"]: module["goals"] = get_chapter(chapter) if chapter_title == titles["PROFS"]: module["profs"] = get_chapter(chapter) if chapter_title == titles["BOOKS"]: module["books"] = get_chapter(chapter) return module def parse_ib_handbook(html: str): result = [] soup = BeautifulSoup(html, 'html.parser') modules = get_all_module_positions(soup=soup) for module, next_module in zip(modules, modules[1:]): module_elements = soup.body.contents[module["index"]:next_module["index"]] module_data = parse_module(module_elements) module_data["title"] = module["title"] if "content" not in module_data: if "goals" in module_data: module_data["content"]= module_data.pop("goals") else: module_data["content"]="" result.append(module_data) return result if __name__ == "__main__": converter = PDFConverter(init_haystack=False) html = converter.convert_pdf_to_html("../../data/modulhandbuch-ib.pdf") modules = parse_ib_handbook(html) with open("output.json", "w", encoding="utf-8") as f: json.dump(modules, f, ensure_ascii=False)