BA-Chatbot/data_service/parser/module_handbook_parser/MEB_parser.py

128 lines
5.0 KiB
Python
Raw Permalink Normal View History

2023-11-15 14:28:48 +01:00
"""
This script parses the content of the IEB module handbook (Modulhandbuch), a PDF document containing details about various modules, including elective modules (Wahlpflichtmodule - WPMs).
The script extracts relevant information like module titles, contents, requirements, professors, learning goals, and literature references, converting them into a structured format suitable for information retrieval and analysis.
The script utilizes BeautifulSoup for HTML parsing, and various functions are defined to accurately extract and organize data from different sections of the handbook, including both regular modules and WPMs.
This structured data is valuable for WPM recommendation.
"""
import os
import sys
from dotenv import load_dotenv
load_dotenv()
sys_path = os.environ.get('SYS_PATH')
sys.path.append(sys_path)
from typing import List
import json
from converter.pdf_converter import PDFConverter
from bs4 import BeautifulSoup, PageElement, Tag
def extract_title_from_thead(thead):
    """Return the module title found in a table header, or ``None``.

    Every ``<th>`` cell of every ``<tr>`` row in *thead* is inspected. When a
    cell's text (with the literal "None" and surrounding whitespace removed)
    is exactly "Modul", the text of the cell that follows it in the same row
    is taken as the title. If several such cells exist, the last one wins.

    Args:
        thead: The header element to search (it must offer ``find_all``), or
            ``None`` when the table has no header section.

    Returns:
        The title text, or ``None`` if *thead* is ``None`` or no "Modul"
        cell with a following cell was found.
    """
    # A table without <thead> yields None from soup.find(); nothing to scan.
    if thead is None:
        return None

    title = None
    for row in thead.find_all('tr'):
        ths = row.find_all("th")
        # Stop one cell early so that ths[idx + 1] always exists.
        for idx, th in enumerate(ths[:-1]):
            txt = th.get_text(strip=True).replace("None", "").strip()
            if txt == "Modul":
                title = ths[idx + 1].get_text(strip=True)
    return title
def contains_text(tag, text):
    """Tell whether *text* occurs in the text content of *tag*.

    A falsy *tag* (for example ``None``) never contains anything, so the
    result is ``False`` in that case.
    """
    if not tag:
        return False
    tag_text = tag.get_text()
    return text in tag_text
def get_html_wpm_tables(html_file):
    """Collect the tables that follow the elective-modules heading.

    The HTML is searched for a bold 14px Arial ``<span>`` whose text contains
    "4.6 Wahlfächer im Hauptstudium". Every ``<table>`` element that comes
    after that span's parent, as a sibling of the parent, is returned.

    Args:
        html_file: HTML markup accepted by ``BeautifulSoup``.

    Returns:
        A list of ``<table>`` tags in document order. The list is empty when
        the heading span cannot be found or no table follows it.
    """
    soup = BeautifulSoup(html_file, 'html.parser')
    heading_spans = soup.find_all('span', style="font-family: Arial-BoldMT; font-size:14px")

    # First span carrying the section heading, or None if there is none.
    span_tag = next(
        (tag for tag in heading_spans
         if contains_text(tag, "4.6 Wahlfächer im Hauptstudium")),
        None,
    )
    if span_tag is None:
        # Without the heading there is no anchor to search from.
        return []

    parent = span_tag.find_parent()
    # next_siblings also yields text nodes; keep real <table> tags only.
    return [
        sibling
        for sibling in parent.next_siblings
        if isinstance(sibling, Tag) and sibling.name == 'table'
    ]
def parse_meb_handbook_tables(html_tables, is_wpm: bool = False):
    """Turn handbook tables into a list of module dictionaries.

    Each entry of *html_tables* is parsed with BeautifulSoup. The title comes
    from the table header (see ``extract_title_from_thead``) or, failing that,
    from the cell following a body cell whose text is exactly "Modul". The
    first ``<tr>`` of every table is skipped when scanning body cells. Body
    cells are assigned to keys by the prefix of their text:

    * "Inhalt" -> ``content``
    * "Voraussetzungen" -> ``requirements``
    * "Dozent" / "Professor" -> ``professor``
    * "Lernziele" / ".Lernziele" -> ``goals``
    * "Literatur" -> ``literatur``

    A module is emitted once it has both ``content`` and ``literatur``. Until
    then the partially filled module is carried over to the next table
    (presumably because one module can be split across several tables --
    verify against the converter output). A module that is still incomplete
    after the last table is not returned.

    Args:
        html_tables: Iterable of HTML table markup, one entry per table.
        is_wpm: Value stored under the ``wpm`` key of every module.

    Returns:
        A list of dictionaries with the keys described above plus ``title``
        (when found) and ``wpm``.
    """
    # Cell-text prefixes and the item key each one is stored under.
    prefix_to_key = (
        ("Inhalt", "content"),
        ("Voraussetzungen", "requirements"),
        (("Dozent", "Professor"), "professor"),
        (("Lernziele", ".Lernziele"), "goals"),
    )

    result = []
    item = {}
    for html_table in html_tables:
        soup = BeautifulSoup(html_table, 'html.parser')
        rows = soup.find_all('tr')
        thead = soup.find("thead")
        # find() returns None for a table without <thead>; skip the lookup then.
        title = extract_title_from_thead(thead=thead) if thead is not None else None
        if title is not None:
            item["title"] = title

        # Literature text met before any content cell; attached after the
        # table has been scanned, provided content showed up by then.
        deferred_literature = None

        for row in rows[1:]:
            tds = row.find_all('td')
            for cell_idx, td in enumerate(tds):
                txt = td.get_text(strip=True).replace("None", "").strip()

                # Fallback title: only when a following cell actually exists.
                has_next_cell = cell_idx + 1 < len(tds)
                if "title" not in item and txt == "Modul" and has_next_cell:
                    item["title"] = tds[cell_idx + 1].get_text(strip=True)

                if txt.startswith("Literatur"):
                    if "content" in item:
                        item["literatur"] = txt
                    else:
                        deferred_literature = txt
                    continue

                for prefix, key in prefix_to_key:
                    if txt.startswith(prefix):
                        item[key] = txt
                        break

        item["wpm"] = is_wpm

        if "content" not in item:
            # Still incomplete: keep filling the same item from the next table.
            continue
        if "literatur" not in item:
            if deferred_literature is None:
                continue
            item["literatur"] = deferred_literature
        result.append(item)
        item = {}
    return result
if __name__ == "__main__":
    # Script entry point: convert the handbook PDFs, dump the elective-module
    # tables as one HTML file and write the parsed modules to JSON files.
    converter = PDFConverter(init_haystack=False)
    # NOTE(review): the returned HTML is not used below; the call is kept in
    # case the conversion has side effects -- confirm and drop it if not.
    html_file = converter.convert_pdf_to_html("../../data/IEB-Modulhandbuch.pdf")
    module_html_tables = converter.convert_pdf_tables_pdfplumber(
        "../../data/IEB-Modulhandbuch_ohne_wpms.pdf")
    wpm_html_tables = converter.convert_pdf_tables_pdfplumber(
        "../../data/IEB-WPMS.pdf")

    # Explicit UTF-8 so the German text is written the same way on every
    # platform, matching the JSON outputs below.
    with open("wpm_html_tables.html", "w", encoding='utf-8') as f:
        f.write("<html>\n")
        f.write("<body>\n")
        f.writelines(wpm_html_tables)
        f.write("</body>\n")
        f.write("</html>\n")

    modules = parse_meb_handbook_tables(html_tables=module_html_tables, is_wpm=False)
    with open("modules_meb.json", "w", encoding='utf-8') as write_file:
        json.dump(modules, write_file, ensure_ascii=False)

    wpms = parse_meb_handbook_tables(html_tables=wpm_html_tables, is_wpm=True)
    print(len(wpms))
    with open("wpms_meb.json", "w", encoding='utf-8') as write_file:
        json.dump(wpms, write_file, ensure_ascii=False)