xgboost model

main
Felix Jan Michael Mucha 2024-06-05 23:18:20 +02:00
parent f924d962b8
commit 52a8e8eeca
10 changed files with 990 additions and 92 deletions

.gitignore

@@ -1,2 +1,2 @@
 /data/
-/settings.json
+settings.json

BIN
features.db 100644

Binary file not shown.

File diff suppressed because one or more lines are too long


File diff suppressed because one or more lines are too long
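The two suppressed diffs above most likely hold the notebook code for the XGBoost model this commit is named after. Because that content is not shown, the following is only a minimal, hypothetical sketch of training such a classifier on the train and validation tables produced further below; every hyperparameter and preprocessing choice here is an assumption, not the committed code.

# Hypothetical sketch, not the suppressed notebook: fit an XGBoost classifier
# on the split tables created by split_and_shuffle_data in feature_extraction.py.
import sqlite3
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

conn = sqlite3.connect('features.db')
train = pd.read_sql_query("SELECT * FROM train", conn)
val = pd.read_sql_query("SELECT * FROM validation", conn)
conn.close()

# 'id' is the record name and 'y' the class label; the remaining columns are features.
feature_cols = [col for col in train.columns if col not in ('id', 'y')]
le = LabelEncoder()
X_train, y_train = train[feature_cols], le.fit_transform(train['y'])
X_val, y_val = val[feature_cols], le.transform(val['y'])

# XGBoost tolerates the NaN values left behind by failed delineations.
model = xgb.XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1)
model.fit(X_train, y_train)
print("validation accuracy:", accuracy_score(y_val, model.predict(X_val)))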


@@ -0,0 +1,37 @@
import sqlite3
import pandas as pd
conn = sqlite3.connect('features.db')
c = conn.cursor()
# print names of available tables
c.execute("SELECT name FROM sqlite_master WHERE type='table';")
print("Table names: ", c.fetchall())
# for each split table created by split_and_shuffle_data, print the number of rows
for table in ['train', 'test', 'validation']:
    c.execute(f'SELECT COUNT(*) FROM {table}')
    print(f"Number of rows in the {table} table: ", c.fetchall()[0][0])
# print the number of rows in features table
c.execute('SELECT COUNT(*) FROM features')
print("Number of rows in the features table: ", c.fetchall()[0][0])
# print column names
c.execute('PRAGMA table_info(features)')
print("Column names in the features table: ", c.fetchall())
# count for each label how many rows there are
c.execute('SELECT y, COUNT(*) FROM features GROUP BY y')
print("Number of rows for each label: ", c.fetchall())
# Load data from the features table into a DataFrame
df = pd.read_sql_query("SELECT * FROM features", conn)
# Now you can work with the data in the df DataFrame
print(df.head(15))
# close the connection
conn.close()
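Note that the loop over the train, test, and validation tables only succeeds after split_and_shuffle_data (defined in feature_extraction.py below) has been run; on a freshly extracted features.db those tables do not exist yet and sqlite3 raises an OperationalError.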


@@ -1,13 +1,10 @@
from matplotlib import pyplot as plt
import wfdb.processing
import sys
import json
import scipy
import numpy as np
import neurokit2 as nk
import math
import time
from multiprocessing import Pool
import sqlite3
def get_y_value(ecg_cleaned, indecies):
"""
@@ -82,97 +79,261 @@ def calculate_axis(record, wave_peak, r_peak_idx, sampling_rate=500, aVF=5, I=0)
return r_axis, t_axis
def get_features(record, label, sampling_rate=500, used_channels=[0, 1, 2, 3, 4, 5], conn=None):
"""
Extracts the features from the record.
Args:
record (object): The record object containing the ECG signal.
label (str): The label of the record.
sampling_rate (int): The sampling rate of the ECG signal.
Returns:
dict: The dictionary containing the extracted features.
"""
age = record.comments[0].split(' ')[1]
gender = record.comments[1].split(' ')[1]
if age == 'NaN' or gender == 'NaN':
return None
features = {}
# Extract the features
features['y'] = label
# Demographic features
features['age'] = int(age)
features['gender'] = True if gender == 'Male' else False
# Signal features
# Delineate the ECG signal
try:
ecg_signal = record.p_signal[:, 0]
ecg_cleaned = nk.ecg_clean(ecg_signal, sampling_rate=sampling_rate)
_, rpeaks = nk.ecg_peaks(ecg_cleaned, sampling_rate=sampling_rate)
r_peaks = rpeaks['ECG_R_Peaks']
_, waves_peak = nk.ecg_delineate(ecg_signal, r_peaks, sampling_rate=sampling_rate, method="peak")
except KeyboardInterrupt:
conn.close()
raise
except:
return None
# TODO Other features and check features
atrial_rate = len(waves_peak['ECG_P_Peaks']) * 6  # P-peak count * 6 approximates beats per minute (assumes 10-second recordings)
ventricular_rate = np.mean(nk.ecg_rate(r_peaks, sampling_rate=sampling_rate, desired_length=len(ecg_cleaned)))
features['artial_rate'] = atrial_rate
features['ventricular_rate'] = ventricular_rate
qrs_duration = np.nanmean(np.array(waves_peak['ECG_S_Peaks']) - np.array(waves_peak['ECG_Q_Peaks']))
features['qrs_duration'] = qrs_duration
qt_interval = np.nanmean(np.array(waves_peak['ECG_T_Offsets']) - np.array(waves_peak['ECG_Q_Peaks']))
features['qt_length'] = qt_interval
q_peak = waves_peak['ECG_Q_Peaks']
s_peak = waves_peak['ECG_S_Peaks']
# check if q_peak, r_peak, s_peak are not nan and therefore a solid qrs complex exists
qrs_count = [any([math.isnan(q_peak[i]), math.isnan(r_peaks[i]), math.isnan(s_peak[i])]) for i in range(len(q_peak))].count(False)
features['qrs_count'] = qrs_count
features['q_peak'] = np.mean(get_y_value(ecg_cleaned, q_peak))
r_axis, t_axis = calculate_axis(record, waves_peak, r_peaks, sampling_rate=500, aVF=5, I=0)
features['r_axis'] = r_axis
features['t_axis'] = t_axis
return features
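# Note: nk.ecg_peaks and nk.ecg_delineate return sample indices, so the
# qrs_duration and qt_length stored above are measured in samples, not
# milliseconds. A minimal sketch of the conversion a consumer of features.db
# might apply (an assumed helper, not part of this commit), using the 500 Hz
# default from get_features:
SAMPLING_RATE = 500  # Hz

def samples_to_ms(n_samples, sampling_rate=SAMPLING_RATE):
    # one sample lasts 1000 / sampling_rate milliseconds
    return 1000.0 * n_samples / sampling_rate

print(samples_to_ms(45))  # a 45-sample QRS complex is 90 ms at 500 Hz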
def exclude_already_extracted(data_dict, conn):
"""
Exclude the records that are already in the database.
Args:
data_dict (dict): The dictionary containing the data.
Returns:
dict: The dictionary containing the unique data.
"""
unique_data_dict = {}
record_ids = []
for label, data in data_dict.items():
for record in data:
record_ids.append(record.record_name)
# get id column from database and subtract it from record_ids
c = conn.cursor()
c.execute('SELECT id FROM features')
db_ids = c.fetchall()
db_ids = [x[0] for x in db_ids]
unique_ids = list(set(record_ids) - set(db_ids))
for label, data in data_dict.items():
unique_data_dict[label] = [record for record in data if record.record_name in unique_ids]
return unique_data_dict
def process_record(data_dict_item, sampling_rate=500, used_channels=[0, 1, 2, 3, 4, 5], conn=None, c=None):
"""
Process a record to extract the features.
Args:
data_dict_item (tuple): Tuple of (label, record, cursor, connection) for one record.
sampling_rate (int): The sampling rate of the ECG signal.
used_channels (list): The list of used channels.
conn (object): The connection object to the database.
c (object): The cursor object.
Returns:
tuple: The tuple containing the record name and the extracted features.
"""
label, record, c, conn = data_dict_item
features = get_features(record,label, sampling_rate=sampling_rate, used_channels=used_channels)
if features is None:
print(f"Failed to extract features for record {record.record_name}")
return record.record_name, None
# TODO: Insert the record into the database
#feature_data[record_name] = features
# Define the feature names
feature_names = list(features.keys())
# Create the table if it doesn't exist
c.execute(f"CREATE TABLE IF NOT EXISTS features (id TEXT PRIMARY KEY, {', '.join(feature_names)})")
# Insert the record into the table
c.execute(f"INSERT INTO features (id, {', '.join(feature_names)}) VALUES (?, {', '.join('?' for _ in feature_names)})",
[record.record_name] + list(features.values()))
conn.commit()
return record.record_name, features
def extract_features_parallel(data_dict, num_processes, sampling_rate=500, used_channels=[0, 1, 2, 3, 4, 5]):
"""
Extracts the features from the data.
Args:
data_dict (dict): The dictionary containing the data.
num_processes (int): The number of processes to use.
"""
start_time = time.time()
failed_records = []
processed_records = 0
conn = sqlite3.connect('features.db')
c = conn.cursor()
# get unique data
data_dict = exclude_already_extracted(data_dict, conn)
for label, data in data_dict.items():
print(f"Extracting features for {label} with {len(data)} data entries.")
with Pool(processes=num_processes) as pool:
results = pool.map(process_record, [(label, record, c, conn) for record in data])
for result in results:
processed_records += 1
if processed_records % 100 == 0:
stop_time = time.time()
print(f"Extracted features for {processed_records} records. Time taken: {stop_time - start_time:.2f}s")
start_time = time.time()
try:
record_name, features = result
except Exception as exc:
print(f'{record_name} generated an exception: {exc}')
else:
if features is not None:
print(f"Extracted features for record {record_name}")
else:
failed_records.append(record_name)
print(f"Sum of failed records: {len(failed_records)}")
conn.close()
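# Caveat for the parallel path above: sqlite3 connections and cursors cannot be
# pickled, so passing conn and c to the Pool workers through pool.map is fragile.
# A hedged alternative (an assumption, not the committed code) is to let each
# worker open its own short-lived connection:
def process_record_worker(data_dict_item, db='features.db'):
    # the tuple now carries only picklable objects
    label, record = data_dict_item
    features = get_features(record, label)
    if features is None:
        return record.record_name, None
    feature_names = list(features.keys())
    # each worker opens and closes its own connection; the timeout lets
    # concurrent INSERTs wait for the write lock instead of failing immediately
    conn = sqlite3.connect(db, timeout=30)
    try:
        c = conn.cursor()
        c.execute(f"CREATE TABLE IF NOT EXISTS features (id TEXT PRIMARY KEY, {', '.join(feature_names)})")
        c.execute(f"INSERT INTO features (id, {', '.join(feature_names)}) VALUES (?, {', '.join('?' for _ in feature_names)})",
                  [record.record_name] + list(features.values()))
        conn.commit()
    finally:
        conn.close()
    return record.record_name, features
# usage inside extract_features_parallel (assumed):
# results = pool.map(process_record_worker, [(label, record) for record in data])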
def extract_features(data_dict, sampling_rate=500, used_channels=[0, 1, 2, 3, 4, 5]):
"""
Extracts the features from the data.
Args:
data_dict (dict): The dictionary containing the data.
Returns:
dict: The dictionary containing the extracted features.
"""
start_time = time.time()
feature_data = {}
failed_records = []
conn = sqlite3.connect('features.db')
c = conn.cursor()
# get the ids of the records that are already in the database
try:
# print how many records are already in the database
c.execute('SELECT COUNT(*) FROM features')
print("Records in DB:", c.fetchall())
c = conn.cursor()
c.execute('SELECT id FROM features')
db_ids = c.fetchall()
db_ids = [x[0] for x in db_ids]
except:
db_ids = []
print("No last file in DB")
for label, data in data_dict.items():
print(f"Extracting features for {label} with {len(data)} data entries.")
for data_idx, record in enumerate(data):
# Skip the records that are already in the database
if record.record_name in db_ids:
continue
# print current status
if data_idx % 100 == 0:
stop_time = time.time()
print(f"Extracted features for {data_idx} records. Time taken: {stop_time - start_time:.2f}s")
start_time = time.time()
age = record.comments[0].split(' ')[1]
gender = record.comments[1].split(' ')[1]
if age == 'NaN' or gender == 'NaN':
continue
features = {}
# Extract the features
features['y'] = label
# Demographic features
features['age'] = int(age)
features['gender'] = True if gender == 'Male' else False
# Signal features
ecg_signal = record.p_signal[:, 0]
ecg_cleaned = nk.ecg_clean(ecg_signal, sampling_rate=sampling_rate)
_, rpeaks = nk.ecg_peaks(ecg_cleaned, sampling_rate=sampling_rate)
r_peaks = rpeaks['ECG_R_Peaks']
# Delineate the ECG signal
try:
_, waves_peak = nk.ecg_delineate(ecg_signal, r_peaks, sampling_rate=sampling_rate, method="peak")
except:
# get the features
features = get_features(record, label, sampling_rate=sampling_rate, used_channels=used_channels, conn=conn)
if features is None:
failed_records.append(record.record_name)
print(f"Failed to extract features for record {record.record_name} Sum of failed records: {len(failed_records)}")
continue
# TODO Other features and check features
atrial_rate = len(waves_peak['ECG_P_Peaks']) * 6
ventricular_rate = np.mean(nk.ecg_rate(r_peaks, sampling_rate=sampling_rate, desired_length=len(ecg_cleaned)))
features['artial_rate'] = atrial_rate
features['ventricular_rate'] = ventricular_rate
qrs_duration = np.nanmean(np.array(waves_peak['ECG_S_Peaks']) - np.array(waves_peak['ECG_Q_Peaks']))
features['qrs_duration'] = qrs_duration
qt_interval = np.nanmean(np.array(waves_peak['ECG_T_Offsets']) - np.array(waves_peak['ECG_Q_Peaks']))
features['qt_length'] = qt_interval
q_peak = waves_peak['ECG_Q_Peaks']
s_peak = waves_peak['ECG_S_Peaks']
# check if q_peak, r_peak, s_peak are not nan and therefore a solid qrs complex exists
qrs_count = [any([math.isnan(q_peak[i]), math.isnan(r_peaks[i]), math.isnan(s_peak[i])]) for i in range(len(q_peak))].count(False)
features['qrs_count'] = qrs_count
features['q_peak'] = np.mean(get_y_value(ecg_cleaned, q_peak))
r_axis, t_axis = calculate_axis(record, waves_peak, r_peaks, sampling_rate=500, aVF=5, I=0)
features['r_axis'] = r_axis
features['t_axis'] = t_axis
# print the features
#print(json.dumps(features, indent=4))
feature_data[record.record_name] = features
return feature_data
# Define the feature names
feature_names = list(features.keys())
# Create the table if it doesn't exist
c.execute(f"CREATE TABLE IF NOT EXISTS features (id TEXT PRIMARY KEY, {', '.join(feature_names)})")
# Insert the record into the table
c.execute(f"INSERT INTO features (id, {', '.join(feature_names)}) VALUES (?, {', '.join('?' for _ in feature_names)})",
[record.record_name] + list(features.values()))
conn.commit()
conn.close()
def split_data(feature_data, split_ratio):
def split_and_shuffle_data(split_ratio, db=None):
"""
Splits the data into training, test, and validation sets.
Args:
split_ratio (list): The ratio in which the data will be split into training, test, and validation sets.
db (str): The name of the database.
"""
print(f"Splitting data with ratio {split_ratio}")
#flatten dictionary
feature_data = {k: v for k, v in feature_data.items()}
# print keys
print("Keys:")
print(len(feature_data.keys()))
# shuffle the data
keys = list(feature_data.keys())
np.random.shuffle(keys)
# split the data
split_data = {}
split_data['train'] = {k: feature_data[k] for k in keys[:int(len(keys) * split_ratio[0])]}
split_data['test'] = {k: feature_data[k] for k in keys[int(len(keys) * split_ratio[0]):int(len(keys) * (split_ratio[0] + split_ratio[1]))]}
split_data['validation'] = {k: feature_data[k] for k in keys[int(len(keys) * (split_ratio[0] + split_ratio[1])):]}
return split_data
if db is None:
db = 'features.db'
conn = sqlite3.connect(db)
c = conn.cursor()
# Randomize the rows into a new table so ROWID reflects the shuffled order
c.execute("CREATE TABLE IF NOT EXISTS features_random AS SELECT * FROM features ORDER BY RANDOM()")
# Drop the old features table
c.execute("DROP TABLE features")
# Rename the new table to features
c.execute("ALTER TABLE features_random RENAME TO features")
# get length of the data
c.execute('SELECT COUNT(*) FROM features')
length = c.fetchone()[0]
# split the data into training, test, and validation sets
train_idx = int(length * split_ratio[0])
test_idx = int(length * (split_ratio[0] + split_ratio[1]))
# Create the train, test, and validation tables if they don't exist
c.execute("CREATE TABLE IF NOT EXISTS train AS SELECT * FROM features WHERE 0")
c.execute("CREATE TABLE IF NOT EXISTS test AS SELECT * FROM features WHERE 0")
c.execute("CREATE TABLE IF NOT EXISTS validation AS SELECT * FROM features WHERE 0")
# Insert rows into the train table
c.execute("INSERT INTO train SELECT * FROM features WHERE ROWID <= ?", (train_idx,))
c.execute("INSERT INTO test SELECT * FROM features WHERE ROWID > ? AND ROWID <= ?", (train_idx, test_idx,))
c.execute("INSERT INTO validation SELECT * FROM features WHERE ROWID > ?", (test_idx,))
# drop the features table
# c.execute('DROP TABLE features')
conn.commit()
conn.close()
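Because the train, test, and validation tables are created with IF NOT EXISTS while the INSERT statements are unconditional, calling split_and_shuffle_data a second time on the same database appends every row again. A small guard one might run before re-splitting (an assumption, not part of this commit):

import sqlite3

conn = sqlite3.connect('features.db')
c = conn.cursor()
# drop any previous split so the next split_and_shuffle_data call starts clean
for table in ('train', 'test', 'validation'):
    c.execute(f'DROP TABLE IF EXISTS {table}')
conn.commit()
conn.close()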


@@ -10,6 +10,7 @@ import os
import numpy as np
import pickle
import json
import multiprocessing
import feature_extraction
@@ -98,7 +99,7 @@ def write_data(data_dict, path='./data', file_prefix=''):
with open(f'{path}/{file_prefix}{cat_name}.pkl', 'wb') as f:
pickle.dump(data, f)
def generate_feature_data(input_data_path, output_data_path, settings, prefix='feature_', split_ratio=None):
def generate_feature_data(input_data_path, settings, parallel=False, split_ratio=None):
"""
Generates the feature data from the raw data.
Args:
@@ -111,26 +112,28 @@ def generate_feature_data(input_data_path, output_data_path, settings, prefix='f
"""
if split_ratio is None:
split_ratio = settings['split_ratio']
data_dict = {}
print(list(os.listdir(input_data_path)))
for file in os.listdir(input_data_path):
if file.endswith(".pkl"):
print(f"Reading {file}")
with open(f'{input_data_path}/{file}', 'rb') as f:
data = pickle.load(f)
data_dict[file.replace('.pkl', '')] = data
# Extract the features
feature_data = feature_extraction.extract_features(data_dict)
#data_dict[file.replace('.pkl', '')] = data
data_dict = {file.replace('.pkl', ''): data}
# Extract the features
if parallel:
# get max number of processes
max_processes = multiprocessing.cpu_count() - 1
print(f"Using {max_processes} processes to extract features.")
feature_extraction.extract_features_parallel(data_dict, num_processes=max_processes)
else:
feature_extraction.extract_features(data_dict)
# Split the data
splited_data = feature_extraction.split_data(feature_data, split_ratio)
if not os.path.exists(f'{output_data_path}/ml_dataset/'):
os.makedirs(f'{output_data_path}/ml_dataset/')
for file_name, data in splited_data.items():
print(f"Writing {file_name} to pickle with {len(data)} data entries.")
with open(f'{output_data_path}/ml_dataset/{prefix}{file_name}', 'wb') as f:
pickle.dump(data, f)
return splited_data
feature_extraction.split_and_shuffle_data(split_ratio=split_ratio)
def main(gen_data=True, gen_features=True, split_ratio=None, settings_path='./settings.json', num_process_files=-1):
def main(gen_data=True, gen_features=True, split_ratio=None, parallel=False, settings_path='./settings.json', num_process_files=-1):
"""
Main function to generate the data.
Args:
@@ -154,7 +157,7 @@ def main(gen_data=True, gen_features=True, split_ratio=None, settings_path='./se
write_data(data_dict, path=settings["data_path"])
ret_data = data_dict
if gen_features:
feature_data_dict = generate_feature_data(settings["data_path"], settings["data_path"], settings, split_ratio=split_ratio)
feature_data_dict = generate_feature_data(settings["data_path"], settings, split_ratio=split_ratio, parallel=parallel)
ret_data = feature_data_dict
return ret_data
@@ -176,5 +179,5 @@ if __name__ == '__main__':
# new GSVT, AFIB, SR, SB
# Generate the data
main(gen_data=True, gen_features=False, num_process_files=100_000)
#main(gen_data=False, gen_features=True, split_ratio=[0.8, 0.1, 0.1], num_process_files=100_000)
#main(gen_data=False, gen_features=True, split_ratio=[0.8, 0.1, 0.1], parallel=False, num_process_files=100_000)
print("Data generation completed.")