BA-Chatbot/data_service/parser/module_handbook_parser/MEB_parser.py


"""
This script parses the IEB module handbook (Modulhandbuch), a PDF document containing details about various modules, including elective modules (Wahlpflichtmodule, WPMs).
It extracts relevant information such as module titles, contents, requirements, professors, learning goals, and literature references, and converts them into a structured format suitable for information retrieval and analysis.
The script uses BeautifulSoup for HTML parsing and defines several functions to accurately extract and organize data from the different sections of the handbook, covering both regular modules and WPMs.
This structured data is valuable for WPM recommendation.
"""
import os
import sys
from dotenv import load_dotenv
load_dotenv()
sys_path = os.environ.get('SYS_PATH')
sys.path.append(sys_path)
from typing import List
import json
from converter.pdf_converter import PDFConverter
from bs4 import BeautifulSoup, PageElement, Tag


def extract_title_from_thead(thead):
    """Return the module title stored in a table header, if there is one.

    Every ``<tr>`` row of *thead* is scanned for a ``<th>`` cell whose
    normalized text is exactly ``"Modul"``; the text of the cell directly
    after it is taken as the title.

    Args:
        thead: The ``<thead>`` element of a module table, or ``None`` when the
            table has no header section.

    Returns:
        The title text, or ``None`` when *thead* is ``None`` or no "Modul"
        cell followed by another cell exists. If several cells match, the
        last match wins.
    """
    # ``soup.find("thead")`` yields None for tables without a header section;
    # report "no title" instead of failing on ``None.find_all``.
    if thead is None:
        return None

    title = None
    for row in thead.find_all('tr'):
        cells = row.find_all("th")
        # Pair each cell with its right-hand neighbour so a trailing "Modul"
        # cell (with nothing after it) is skipped automatically.
        for label_cell, value_cell in zip(cells, cells[1:]):
            # The literal text "None" is removed before comparing
            # (presumably a placeholder for empty cells -- TODO confirm).
            label = label_cell.get_text(strip=True).replace("None", "").strip()
            if label == "Modul":
                title = value_cell.get_text(strip=True)
    return title

def contains_text(tag, text):
    """Tell whether *text* occurs in the text content of *tag*.

    A falsy *tag* (for example ``None``) never contains anything, so the
    result is ``False`` in that case.
    """
    if not tag:
        return False
    tag_text = tag.get_text()
    return text in tag_text

def get_html_wpm_tables(html_file):
    """Collect the tables that follow the elective-modules heading.

    The heading is the first bold 14px Arial ``<span>`` whose text contains
    "4.6 Wahlfächer im Hauptstudium". Every ``<table>`` element that is a
    later sibling of that span's parent is returned.

    Args:
        html_file: HTML markup (anything ``BeautifulSoup`` accepts) of the
            converted handbook.

    Returns:
        A list of ``<table>`` tags in document order. The list is empty when
        the heading is not present or no table follows it.
    """
    soup = BeautifulSoup(html_file, 'html.parser')
    # Section headings are identified by their exact inline style.
    heading_spans = soup.find_all(
        'span', style="font-family: Arial-BoldMT; font-size:14px")
    heading = next(
        (span for span in heading_spans
         if contains_text(span, "4.6 Wahlfächer im Hauptstudium")),
        None,
    )
    # Without the heading there is nothing to anchor on; return no tables
    # instead of failing on ``None.find_parent()``.
    if heading is None:
        return []

    parent = heading.find_parent()
    if parent is None:
        return []

    # Siblings also include text nodes, so keep only real <table> tags.
    return [
        sibling for sibling in parent.next_siblings
        if isinstance(sibling, Tag) and sibling.name == 'table'
    ]

def parse_meb_handbook_tables(html_tables: List[str], is_wpm: bool = False) -> List[dict]:
    """Turn handbook HTML tables into a list of module dictionaries.

    A module may be spread over several consecutive tables, so fields are
    accumulated in one dictionary across tables. The dictionary is emitted
    (and a fresh one started) as soon as it holds both a ``"content"`` and a
    ``"literatur"`` entry.

    Args:
        html_tables: HTML markup of the tables, one string per table, in
            document order.
        is_wpm: Value stored under the ``"wpm"`` key of every module.

    Returns:
        A list of dictionaries. Each has the keys ``"content"``,
        ``"literatur"`` and ``"wpm"``; ``"title"``, ``"requirements"``,
        ``"professor"`` and ``"goals"`` are present when a matching cell was
        found. Modules that never receive both content and literature are
        not included.
    """
    # Cell-text prefixes that map directly onto an output field.
    prefix_to_field = (
        (("Inhalt",), "content"),
        (("Voraussetzungen",), "requirements"),
        (("Dozent", "Professor"), "professor"),
        (("Lernziele", ".Lernziele"), "goals"),
    )

    result = []
    item = {}
    for html_table in html_tables:
        soup = BeautifulSoup(html_table, 'html.parser')

        # Tables without a <thead> have no header title; the "Modul" cell
        # lookup in the body rows below acts as the fallback.
        thead = soup.find("thead")
        title = extract_title_from_thead(thead=thead) if thead is not None else None
        if title is not None:
            item["title"] = title

        # Literature found in this table before any content is known; it is
        # only attached if content shows up by the end of the same table.
        pending_literature = None

        # The first <tr> of the table is skipped, as in the header handling.
        for row in soup.find_all('tr')[1:]:
            cells = row.find_all('td')
            for position, cell in enumerate(cells):
                # The literal text "None" is removed from the cell text
                # (presumably an empty-cell placeholder -- TODO confirm).
                txt = cell.get_text(strip=True).replace("None", "").strip()

                # Same bounds check as the header lookup: a trailing "Modul"
                # cell has no neighbour to read the title from.
                if (txt == "Modul" and "title" not in item
                        and position + 1 < len(cells)):
                    item["title"] = cells[position + 1].get_text(strip=True)

                if txt.startswith("Literatur"):
                    if "content" in item:
                        item["literatur"] = txt
                    else:
                        pending_literature = txt
                    continue

                for prefixes, field in prefix_to_field:
                    if txt.startswith(prefixes):
                        item[field] = txt
                        break

        item["wpm"] = is_wpm
        if "content" in item:
            if "literatur" not in item and pending_literature is not None:
                item["literatur"] = pending_literature
            if "literatur" in item:
                result.append(item)
                item = {}

    return result

if __name__ == "__main__":
    # Script entry point: convert the handbook PDFs to HTML tables, dump the
    # WPM tables to an HTML file for inspection, and write the parsed regular
    # modules and WPMs to JSON files in the current working directory.
    converter = PDFConverter(init_haystack=False)

    # NOTE(review): the result of this conversion is not used below; the call
    # is kept in case it has side effects -- confirm and remove if it has none.
    html_file = converter.convert_pdf_to_html("../../data/IEB-Modulhandbuch.pdf")
    module_html_tables = converter.convert_pdf_tables_pdfplumber(
        "../../data/IEB-Modulhandbuch_ohne_wpms.pdf")
    wpm_html_tables = converter.convert_pdf_tables_pdfplumber(
        "../../data/IEB-WPMS.pdf")

    # Explicit UTF-8, matching the JSON outputs, so non-ASCII characters in
    # the tables do not depend on the platform's default encoding.
    with open("wpm_html_tables.html", "w", encoding='utf-8') as f:
        f.write("<html>\n")
        f.write("<body>\n")
        f.writelines(wpm_html_tables)
        f.write("</body>\n")
        f.write("</html>\n")

    modules = parse_meb_handbook_tables(html_tables=module_html_tables, is_wpm=False)
    with open("modules_meb.json", "w", encoding='utf-8') as write_file:
        json.dump(modules, write_file, ensure_ascii=False)

    wpms = parse_meb_handbook_tables(html_tables=wpm_html_tables, is_wpm=True)
    # Quick sanity check: number of elective modules that were parsed.
    print(len(wpms))
    with open("wpms_meb.json", "w", encoding='utf-8') as write_file:
        json.dump(wpms, write_file, ensure_ascii=False)