BA-Chatbot/data_service/parser/module_handbook_parser/IB_parser.py


"""
This script is a specialized parser for the IB module handbook (Modulhandbuch), a PDF document containing detailed information about various academic modules.
The handbook PDF is first converted to HTML; the script then processes that HTML to extract and structure essential data such as module names, credits, semesters, professors, learning goals, contents, and recommended literature.
It employs BeautifulSoup for the HTML parsing and writes the extracted content in a more accessible JSON format.
The script is tailored to handle the specific formatting styles within the handbook, ensuring accurate and comprehensive data extraction. This structured data is valuable for WPM Recommendation.
"""
import os
import sys
from dotenv import load_dotenv
load_dotenv()
sys_path = os.environ.get('SYS_PATH')
sys.path.append(sys_path)
from bs4 import BeautifulSoup, PageElement, Tag
from converter.pdf_converter import PDFConverter
import json
from typing import List

# Text substituted for the "(cid:4)" glyph placeholder in extracted chapters.
BULLET_POINT = "- "

# CSS attribute selectors for the inline styles this parser keys on:
# bold 9px spans carry chapter headings, italic 9px spans carry the labels
# of the summary fields at the top of a module.
CHAPTER_STYLE = '[style="font-family: NimbusSanL-Bold; font-size:9px"]'
MODULE_TOP_SUMMARY_STYLE = (
    '[style="font-family: NimbusRomNo9L-ReguItal; font-size:9px"]'
)

# Label texts of the summary fields, compared against the stripped span text.
summary_points = {
    "NAME": "Name",
    "CREDITS": "Kreditpunkte",
    "SEMESTER": "Semester",
    "STUD_PROGRAMS": "Studiengänge",
}

# Chapter heading texts. Every value ends the chapter that precedes it,
# including the headings whose own body is not stored.
titles = {
    "WORK_LOAD": "Arbeitsaufwand (work load)",
    "SEMESTER_HOURS": "Semesterwochenstunden",
    "GOALS": "Lernziele/Kompetenzen",
    "PROFS": "Dozentinnen / Dozenten",
    "BOOKS": "Literatur",
    "CONTENT": "Inhalte",
}


def get_chapter(start_element):
    """Collect the text that follows a chapter heading.

    Starting at the sibling right after the heading's parent element, the
    text of each consecutive sibling tag is gathered. Collection stops when
    a sibling contains a chapter-heading span whose stripped text is one of
    the values in ``titles``, or when the next sibling is not a ``Tag``
    (a text node, or the end of the parent's children).

    Args:
        start_element: The heading span; the siblings following its parent
            are read.

    Returns:
        The concatenated sibling text with every "(cid:4)" placeholder
        replaced by ``BULLET_POINT``. An empty string is returned when the
        heading's parent is not directly followed by a tag.
    """
    heading_selector = f'span{CHAPTER_STYLE}'
    pieces = []
    node = start_element.parent.next_sibling
    # Only tags support select_one(); a text node directly after the heading
    # therefore ends the chapter instead of raising AttributeError. This is
    # the same stop rule already applied to every later sibling.
    while isinstance(node, Tag):
        heading = node.select_one(heading_selector)
        if heading and heading.text.strip() in titles.values():
            # The next chapter begins here.
            break
        pieces.append(node.text)
        node = node.next_sibling

    return "".join(pieces).replace("(cid:4)", BULLET_POINT)


def get_all_module_positions(soup: BeautifulSoup):
    """Find the spans this parser treats as module headings.

    A heading is any span whose style attribute is exactly
    "font-family: NimbusSanL-Bold; font-size:11px".

    Args:
        soup: The parsed handbook document.

    Returns:
        A list of dicts, one per heading in ``find_all`` order, holding
        "index" (position of the heading's parent inside ``soup.body``) and
        "title" (the heading span's text, not stripped).
    """
    heading_attrs = {'style': 'font-family: NimbusSanL-Bold; font-size:11px'}
    return [
        {"index": soup.body.index(heading.parent), "title": heading.text}
        for heading in soup.find_all('span', heading_attrs)
    ]


def parse_module(elements: List[PageElement]):
    """Extract the fields of one module from its slice of body elements.

    Only ``div`` elements are inspected. A div may contribute a summary
    field, a chapter, or both:

    * Summary label "Kreditpunkte": the stripped text of the element right
      after the label's parent is stored as "credits".
    * Summary label "Semester": the stripped text three siblings after the
      label's parent is stored as "semester".
    * Chapter headings "Inhalte", "Lernziele/Kompetenzen",
      "Dozentinnen / Dozenten" and "Literatur": the chapter body returned by
      ``get_chapter`` is stored as "content", "goals", "profs" and "books".

    Args:
        elements: The body children belonging to a single module.

    Returns:
        A dict containing whichever of the keys above were found.
    """
    summary_selector = f'span{MODULE_TOP_SUMMARY_STYLE}'
    chapter_selector = f'span{CHAPTER_STYLE}'
    # Chapter heading text -> key under which the chapter body is stored.
    # Headings listed in ``titles`` but absent here are recognised and skipped.
    chapter_keys = {
        titles["CONTENT"]: "content",
        titles["GOALS"]: "goals",
        titles["PROFS"]: "profs",
        titles["BOOKS"]: "books",
    }

    module = {}
    for element in elements:
        if element.name != "div":
            continue

        summary_elem = element.select_one(summary_selector)
        if summary_elem:
            summary_text = summary_elem.text.strip()
            # Siblings are read only for the labels that are actually stored,
            # so an unused label with nothing after it cannot raise.
            if summary_text == summary_points["CREDITS"]:
                module["credits"] = (
                    summary_elem.parent.next_sibling.text.strip()
                )
            if summary_text == summary_points["SEMESTER"]:
                label_parent = summary_elem.parent
                value_node = label_parent.next_sibling.next_sibling.next_sibling
                module["semester"] = value_node.text.strip()

        chapter = element.select_one(chapter_selector)
        if chapter:
            key = chapter_keys.get(chapter.text.strip())
            if key is not None:
                module[key] = get_chapter(chapter)
    return module


def parse_ib_handbook(html: str):
    """Parse the handbook HTML into a list of module dicts.

    The body is cut into one slice per module heading reported by
    ``get_all_module_positions``; each slice is handed to ``parse_module``.

    Args:
        html: The handbook converted to HTML.

    Returns:
        A list with one dict per module heading. Each dict holds the fields
        found by ``parse_module`` plus "title" and always a "content" key:
        when no content chapter was found, the "goals" entry is moved to
        "content", and if that is missing too, "content" is an empty string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    modules = get_all_module_positions(soup=soup)

    # A module runs from its own heading up to the next heading. The last
    # heading has no successor, so its slice is open-ended (None); pairing
    # with modules[1:] alone would silently drop that final module.
    end_indices = [following["index"] for following in modules[1:]] + [None]

    result = []
    for module, end_index in zip(modules, end_indices):
        module_elements = soup.body.contents[module["index"]:end_index]
        module_data = parse_module(module_elements)
        module_data["title"] = module["title"]
        if "content" not in module_data:
            # Fall back to the goals chapter, or to an empty string.
            module_data["content"] = module_data.pop("goals", "")
        result.append(module_data)
    return result


if __name__ == "__main__":
    # Script entry point: convert the handbook PDF to HTML, parse it, and
    # write the resulting module list as JSON into the working directory.
    pdf_converter = PDFConverter(init_haystack=False)
    handbook_html = pdf_converter.convert_pdf_to_html(
        "../../data/modulhandbuch-ib.pdf")
    parsed_modules = parse_ib_handbook(handbook_html)
    with open("output.json", mode="w", encoding="utf-8") as output_file:
        # ensure_ascii=False writes non-ASCII characters as-is, not escaped.
        json.dump(parsed_modules, output_file, ensure_ascii=False)
initial 2023-11-15 14:28:48 +01:00
			`"""`
			`This script is a specialized parser for the IB Modulehandbook, which is a PDF document containing detailed information about various academic modules.`
			`The script processes the handbook to extract and structure essential data such as module names, credits, semesters, professors, learning goals, contents, and recommended literature.`
			`It employs BeautifulSoup for HTML parsing, converting the content into a more accessible JSON format.`
			`The script is tailored to handle specific formatting styles within the handbook, ensuring accurate and comprehensive data extraction. This structured data is valuable for WPM Recommendation`
			`"""`
			`import os`
			`import sys`
			`from dotenv import load_dotenv`
			`load_dotenv()`
			`sys_path = os.environ.get('SYS_PATH')`
			`sys.path.append(sys_path)`
			`from bs4 import BeautifulSoup, PageElement, Tag`
			`from converter.pdf_converter import PDFConverter`
			`import json`
			`from typing import List`

			`BULLET_POINT = '- '`
			`CHAPTER_STYLE = '[style="font-family: NimbusSanL-Bold; font-size:9px"]'`
			`MODULE_TOP_SUMMARY_STYLE = '[style="font-family: NimbusRomNo9L-ReguItal; font-size:9px"]'`
			`summary_points = {`
			`"NAME": "Name",`
			`"CREDITS": "Kreditpunkte",`
			`"SEMESTER": "Semester",`
			`"STUD_PROGRAMS": "Studiengänge"`
			`}`

			`titles = {`
			`"WORK_LOAD": "Arbeitsaufwand (work load)",`
			`"SEMESTER_HOURS": "Semesterwochenstunden",`
			`"GOALS": "Lernziele/Kompetenzen",`
			`"PROFS": "Dozentinnen / Dozenten",`
			`"BOOKS": "Literatur",`
			`"CONTENT": "Inhalte"`
			`}`


			`def get_chapter(start_element):`
			`current_element = start_element.parent.next_sibling`
			`content = ""`
			`while current_element:`
			`found_child = current_element.select_one(`
			`'span[style="font-family: NimbusSanL-Bold; font-size:9px"]')`
			`if found_child and found_child.text.strip() in titles.values():`
			`break`
			`content += current_element.text`
			`if isinstance(current_element.next_sibling, Tag):`
			`current_element = current_element.next_sibling`
			`else:`
			`break`

			`return content.replace("(cid:4)", BULLET_POINT)`


			`def get_all_module_positions(soup: BeautifulSoup):`
			`elements = soup.find_all(`
			`'span', {'style': 'font-family: NimbusSanL-Bold; font-size:11px'})`
			`result = []`
			`for elem in elements:`
			`module = {"index": soup.body.index(elem.parent), "title": elem.text}`
			`result.append(module)`
			`return result`


			`def parse_module(elements: List[PageElement]):`
			`module = {}`
			`for element in elements:`

			`if element.name == "div" and element.select_one(f'span{MODULE_TOP_SUMMARY_STYLE}'):`
			`summary_elem = element.select_one(`
			`f'span{MODULE_TOP_SUMMARY_STYLE}')`
			`summary_text = summary_elem.text.strip()`
			`sibling=summary_elem.parent.next_sibling.text.strip()`
			`if summary_text == summary_points["CREDITS"]:`
			`module["credits"]=sibling`
			`if summary_text == summary_points["SEMESTER"]:`
			`sibling=summary_elem.parent.next_sibling.next_sibling.next_sibling.text.strip()`
			`module["semester"]=sibling`
			`if element.name == "div" and element.select_one(f'span[style="font-family: NimbusSanL-Bold; font-size:9px"]'):`
			`chapter = element.select_one(`
			`'span[style="font-family: NimbusSanL-Bold; font-size:9px"]')`
			`chapter_title = chapter.text.strip()`
			`if chapter_title in titles.values():`
			`if chapter_title == titles["CONTENT"]:`
			`module["content"] = get_chapter(chapter)`
			`if chapter_title == titles["GOALS"]:`
			`module["goals"] = get_chapter(chapter)`
			`if chapter_title == titles["PROFS"]:`
			`module["profs"] = get_chapter(chapter)`
			`if chapter_title == titles["BOOKS"]:`
			`module["books"] = get_chapter(chapter)`
			`return module`


			`def parse_ib_handbook(html: str):`
			`result = []`
			`soup = BeautifulSoup(html, 'html.parser')`
			`modules = get_all_module_positions(soup=soup)`
			`for module, next_module in zip(modules, modules[1:]):`
			`module_elements = soup.body.contents[module["index"]:next_module["index"]]`
			`module_data = parse_module(module_elements)`
			`module_data["title"] = module["title"]`
			`if "content" not in module_data:`
			`if "goals" in module_data:`
			`module_data["content"]= module_data.pop("goals")`
			`else:`
			`module_data["content"]=""`
			`result.append(module_data)`
			`return result`


			`if __name__ == "__main__":`
			`converter = PDFConverter(init_haystack=False)`
			`html = converter.convert_pdf_to_html("../../data/modulhandbuch-ib.pdf")`
			`modules = parse_ib_handbook(html)`
			`with open("output.json", "w", encoding="utf-8") as f:`
			`json.dump(modules, f, ensure_ascii=False)`