BA-Chatbot/data_service/converter/sql_to_json.py

54 lines
2.0 KiB
Python
Executable File

"""
This script includes a SQL to JSON converter specifically designed for converting data from a university's module database into a JSON format.
It connects to a MySQL database, retrieves module data using a SQL query, and processes this data into a structured JSON format.
NOTE: You have to run MySQL Database on localhost at the default PORT.
The script removes unnecessary timestamp columns, combines relevant fields to create a content field for each module, and identifies elective modules (Wahlpflichtmodule) based on specific criteria.
The processed data is saved in a JSON file.
"""
import pymysql
import pandas as pd
import json
# Database connection setup.
# NOTE(review): the credentials are hard-coded; consider loading them from
# environment variables or a config file before sharing this script.
db_connection = pymysql.connect(
    host='localhost',
    user='maydane',
    password='1234',
    db='mydatabase')
try:
    # SQL query execution: load the complete `modul` table into a DataFrame.
    query = 'SELECT * FROM modul'
    df = pd.read_sql(query, con=db_connection)
finally:
    # Close the connection even when the query fails, so it is never leaked.
    db_connection.close()

timestamp_cols = ['changed']
# Remove the timestamp columns because they are irrelevant for the export.
df = df.drop(columns=timestamp_cols)
# Convert the DataFrame into a list with one dictionary per row.
json_data = df.to_dict(orient='records')
# Write the rows to a JSON file; ensure_ascii=False keeps non-ASCII
# characters readable, so the file is explicitly written as UTF-8.
with open('data.json', 'w', encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False)
#---------------------------------------------------------
# This file comes from a notebook, so the following part is another script that parses the JSON into a suitable format.
# TODO: This can be refactored into the upper script
import json
def _as_text(value):
    """Return *value* as a string, mapping None (JSON null) to ''."""
    return "" if value is None else str(value)


# Load the exported rows for conversion into the target format.
# The export is written as UTF-8, so it must be read back as UTF-8 instead of
# relying on the platform default encoding.
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Iterate over the rows, adding the derived fields to each one.
for dic in data:
    # Combine name, contents and competencies into a single text field.
    # dict.get() only falls back to its default for *missing* keys, so null
    # values are normalised explicitly to avoid a TypeError on concatenation.
    combined_str = " ".join(
        _as_text(dic.get(key))
        for key in ("name_de", "inhalte_de", "kompetenzen_de")
    )
    dic["content"] = combined_str
    # A row is flagged as an elective module (Wahlpflichtmodul) when its
    # semester is "6/7" and its `pflichtmodul` value equals 0.
    dic["is_wpm"] = dic.get("semester") == "6/7" and dic.get("pflichtmodul") == 0

# Save the updated rows to a new JSON file. ensure_ascii=False emits raw
# non-ASCII characters, so the file is explicitly written as UTF-8.
with open('converted_data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)