128 lines
5.0 KiB
Python
Executable File
128 lines
5.0 KiB
Python
Executable File
|
|
"""
This script parses the content of the IEB module handbook (Modulhandbuch), a PDF document containing details about various modules, including elective modules (Wahlpflichtmodule - WPMs).

The script extracts relevant information such as module titles, contents, requirements, professors, learning goals, and literature references, and converts it into a structured format suitable for information retrieval and analysis.

The script uses BeautifulSoup for HTML parsing. Several functions are defined to extract and organize data from the different sections of the handbook, covering both regular modules and WPMs.

The resulting structured data is used for WPM recommendation.
"""
|
|
import os
|
|
import sys
|
|
from dotenv import load_dotenv
|
|
# Load variables from a local .env file into the process environment.
load_dotenv()

# SYS_PATH is an optional extra import root (presumably the directory that
# holds the `converter` package imported below -- confirm against the repo).
sys_path = os.environ.get('SYS_PATH')

# os.environ.get returns None when SYS_PATH is unset; only real path strings
# belong on sys.path, and there is no point in adding the same entry twice.
if sys_path and sys_path not in sys.path:
    sys.path.append(sys_path)
|
|
from typing import List
|
|
import json
|
|
from converter.pdf_converter import PDFConverter
|
|
from bs4 import BeautifulSoup, PageElement, Tag
|
|
|
|
|
|
|
|
|
|
def extract_title_from_thead(thead):
    """Return the module title found in a table header, or ``None``.

    Every ``<tr>`` row of *thead* is scanned for a ``<th>`` cell whose cleaned
    text is exactly ``"Modul"``; the text of the cell directly after it is
    taken as the title.

    Args:
        thead: The ``<thead>`` element of a module table, or ``None`` when the
            table has no header section (``soup.find("thead")`` returns
            ``None`` in that case).

    Returns:
        The title text, or ``None`` when *thead* is ``None`` or no ``"Modul"``
        label cell followed by another cell exists. If several label cells
        match, the last one wins.
    """
    # A table without a <thead> simply has no title to offer.
    if thead is None:
        return None

    title = None
    for row in thead.find_all('tr'):
        ths = row.find_all("th")
        # Pair each cell with its right-hand neighbour, so a "Modul" label in
        # the last column (no neighbour) is skipped automatically.
        for label_cell, value_cell in zip(ths, ths[1:]):
            # Presumably empty cells arrive as the literal text "None" from
            # the PDF-to-HTML conversion; strip that before comparing.
            txt = label_cell.get_text(strip=True).replace("None", "").strip().strip('\n')
            if txt == "Modul":
                title = value_cell.get_text(strip=True)
    return title
|
|
|
|
def contains_text(tag, text):
    """Tell whether *text* occurs in the text content of *tag*.

    Args:
        tag: An object exposing ``get_text()``, or a falsy value such as
            ``None``.
        text: The substring to look for.

    Returns:
        ``False`` when *tag* is falsy, otherwise the result of the substring
        test against ``tag.get_text()``.
    """
    if not tag:
        return False
    tag_text = tag.get_text()
    return text in tag_text
|
|
|
|
def get_html_wpm_tables(html_file):
    """Collect the tables that follow the elective-modules heading.

    The HTML is searched for a bold 14px Arial ``<span>`` whose text contains
    ``"4.6 Wahlfächer im Hauptstudium"``. Every ``<table>`` element that is a
    later sibling of that span's parent is returned.

    Args:
        html_file: HTML markup (or an open file) accepted by ``BeautifulSoup``.

    Returns:
        A list of ``<table>`` tags in document order. The list is empty when
        the heading is not present or no table follows it.
    """
    soup = BeautifulSoup(html_file, 'html.parser')

    # Section headings are matched by their exact inline style string.
    heading_spans = soup.find_all('span', style="font-family: Arial-BoldMT; font-size:14px")
    span_tag = next(
        (tag for tag in heading_spans
         if contains_text(tag, "4.6 Wahlfächer im Hauptstudium")),
        None,
    )
    # Without the heading there is nothing to anchor on; report "no tables"
    # instead of failing with an AttributeError on None.
    if span_tag is None:
        return []

    parent = span_tag.find_parent()
    if parent is None:
        return []

    # next_siblings also yields text nodes, so keep only real <table> tags.
    return [
        sibling for sibling in parent.next_siblings
        if isinstance(sibling, Tag) and sibling.name == 'table'
    ]
|
|
|
|
def parse_meb_handbook_tables(html_tables, is_wpm: bool = False):
    """Turn handbook tables into a list of module dictionaries.

    Each entry of *html_tables* is parsed on its own. A module record is built
    up across consecutive tables and emitted once it has both a ``"content"``
    and a ``"literatur"`` entry; an unfinished record is carried over to the
    next table.

    Recognised cell prefixes and the keys they fill:
    ``Inhalt`` -> ``content``, ``Voraussetzungen`` -> ``requirements``,
    ``Dozent``/``Professor`` -> ``professor``,
    ``Lernziele``/``.Lernziele`` -> ``goals``, ``Literatur`` -> ``literatur``.
    The title comes from the table header, or failing that from the cell
    following a ``Modul`` label cell in the body.

    Args:
        html_tables: Iterable of HTML table markup strings.
        is_wpm: Value stored under the ``"wpm"`` key of every record.

    Returns:
        A list of dicts, one per completed module record.
    """
    # Cell-text prefixes and the record key each of them fills. The prefixes
    # are mutually exclusive, so at most one entry matches a given cell.
    field_prefixes = (
        (("Inhalt",), "content"),
        (("Voraussetzungen",), "requirements"),
        (("Dozent", "Professor"), "professor"),
        (("Lernziele", ".Lernziele"), "goals"),
    )

    result = []
    item = {}
    for html_table in html_tables:
        soup = BeautifulSoup(html_table, 'html.parser')
        rows = soup.find_all('tr')

        # A table may have no <thead>; then there is no header title to read.
        thead = soup.find("thead")
        title = extract_title_from_thead(thead=thead) if thead is not None else None
        if title is not None:
            item["title"] = title

        # Holds a literature cell that shows up before this record has any
        # content, so it can be attached after the table's rows are processed.
        splitted_table_data = {}

        # The first row is skipped (presumably the header row handled above).
        for row in rows[1:]:
            tds = row.find_all('td')
            for cell_idx, td in enumerate(tds):
                txt = td.get_text(strip=True)
                # Presumably empty cells arrive as the literal text "None".
                txt = txt.replace("None", "").strip().strip('\n')

                # Fallback title: the cell right after a "Modul" label, if
                # such a cell exists in this row.
                if "title" not in item and txt == "Modul" and cell_idx + 1 < len(tds):
                    item["title"] = tds[cell_idx + 1].get_text(strip=True)

                for prefixes, key in field_prefixes:
                    if txt.startswith(prefixes):
                        item[key] = txt

                if txt.startswith("Literatur"):
                    if "content" in item:
                        item["literatur"] = txt
                    else:
                        splitted_table_data["literatur"] = txt

        item["wpm"] = is_wpm

        if "content" in item:
            # Attach literature that was seen before the content cell.
            if "literatur" not in item and "literatur" in splitted_table_data:
                item["literatur"] = splitted_table_data["literatur"]
            # Emit the record only once it is complete, then start a new one.
            if "literatur" in item:
                result.append(item)
                item = {}

    return result
|
|
|
|
if __name__ == "__main__":
    # Script entry point: convert the handbook PDFs, dump the raw WPM tables
    # as HTML for inspection, and write the parsed modules and WPMs as JSON.
    converter = PDFConverter(init_haystack=False)

    # NOTE(review): html_file is not used below; the call is kept because its
    # side effects, if any, are not visible from this file.
    html_file = converter.convert_pdf_to_html("../../data/IEB-Modulhandbuch.pdf")
    module_html_tables = converter.convert_pdf_tables_pdfplumber(
        "../../data/IEB-Modulhandbuch_ohne_wpms.pdf")
    wpm_html_tables = converter.convert_pdf_tables_pdfplumber(
        "../../data/IEB-WPMS.pdf")

    # Write with an explicit UTF-8 encoding, matching the JSON dumps below,
    # so the output does not depend on the platform's default encoding.
    with open("wpm_html_tables.html", "w", encoding='utf-8') as f:
        f.write("<html>\n")
        f.write("<body>\n")
        f.writelines(wpm_html_tables)
        f.write("</body>\n")
        f.write("</html>\n")

    modules = parse_meb_handbook_tables(html_tables=module_html_tables, is_wpm=False)
    with open("modules_meb.json", "w", encoding='utf-8') as write_file:
        json.dump(modules, write_file, ensure_ascii=False)

    wpms = parse_meb_handbook_tables(html_tables=wpm_html_tables, is_wpm=True)
    print(len(wpms))
    with open("wpms_meb.json", "w", encoding='utf-8') as write_file:
        json.dump(wpms, write_file, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
|
|