BA-Chatbot/data_service/converter/sql_to_json.py


"""
This script includes a SQL to JSON converter specifically designed for converting data from a university's module database into a JSON format.
It connects to a MySQL database, retrieves module data using a SQL query, and processes this data into a structured JSON format.
NOTE: You have to run  MySQL Database on localhost at the default PORT.
The script removes unnecessary timestamp columns, combines relevant fields to create a content field for each module, and identifies elective modules (Wahlpflichtmodule) based on specific criteria.
The processed data is saved in a JSON file.
"""
import pymysql
import pandas as pd
import json
# Database connection setup
db_connection = pymysql.connect(
    host='localhost',
    user='maydane',
    password='1234',
    db='mydatabase')

# SQL query execution
query = 'SELECT * FROM modul'
df = pd.read_sql(query, con=db_connection)

timestamp_cols = ['changed']

# Remove timestamps, cause they are irrelevant
df = df.drop(columns=timestamp_cols)
# DataFrame in JSON konvertieren
json_data = df.to_dict(orient='records')

# Write JSON to file
with open('data.json', 'w', encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False)
# Verbindung schließen
db_connection.close()

#---------------------------------------------------------
# This file is from a notebook. So this part is antoher script for parsing the json to a suitable format.
# TODO: This can be refactored into the upper script
import json
# Load the data from a JSON file for converting to right format
with open('data.json', 'r') as f:
    data = json.load(f)

# Iterate over the data, combining the fields
for dic in data:
        combined_str = dic.get("name_de", "") + " " + dic.get("inhalte_de", "") + " " + dic.get("kompetenzen_de", "")
        dic["content"] = combined_str
        dic["is_wpm"]= dic.get("semester") == "6/7" and dic.get("pflichtmodul") == 0


# If you want to save the updated data back to the JSON file:
with open('converted_data.json', 'w') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)