restructure project

2025-12-13 10:28:22 +01:00 · 2025-02-18 14:41:41 +01:00 · 2025-02-18 14:41:41 +01:00 · 052cf356c9
commit 052cf356c9
parent 019a426af4
14 changed files with 1220 additions and 826 deletions
--- a/Barite_50_Data.h5
+++ b/Barite_50_Data.h5
--- a/Barite_50_Data_inference.h5
+++ b/Barite_50_Data_inference.h5
--- a/Barite_50_Data_training.h5
+++ b/Barite_50_Data_training.h5
--- a/pycache/preprocessing.cpython-311.pyc
+++ b/pycache/preprocessing.cpython-311.pyc
--- a/barite_50_4_corner.h5
+++ b/barite_50_4_corner.h5
--- a/doc/measurement_plan.md
+++ b/doc/measurement_plan.md
@ -0,0 +1,24 @@
+# Measurement overview for model optimization
+
+### Parameters to optimize
+
+
+
+
+
+
+
+### Saved models
+
+`./results/model_large_standardization.keras`: Trained on the `barite_50_4_corner.h5` dataset with an extended loss function (Huber loss with mass balance) and **standardized data**.
+
+
+
+
+
+### Experiments
+
+| **Experiment** | **Dataset**           | **Model** | **Loss function** | **Activation** | **Preprocessing** |
+|----------------|-----------------------|-----------|-------------------|----------------|-------------------|
+| 1              | barite_50_4_corner.h5 | large     | Huber+dBa+dSr    | LeakyReLU      | Standardization   |
+| 2              | barite_50_4_corner.h5 | 
--- a/loss_1_to_end.png
+++ b/loss_1_to_end.png
--- a/loss_all.png
+++ b/loss_all.png
--- a/optuna_runs.py
+++ b/optuna_runs.py
@ -1,225 +0,0 @@
-import keras
-from keras.layers import Dense, Dropout, Input,BatchNormalization
-import tensorflow as tf
-import h5py
-import numpy as np
-import pandas as pd
-import time
-import sklearn.model_selection as sk
-import matplotlib.pyplot as plt
-from sklearn.cluster import KMeans
-from sklearn.pipeline import Pipeline, make_pipeline
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
-from imblearn.over_sampling import SMOTE
-from imblearn.under_sampling import RandomUnderSampler
-from imblearn.over_sampling import RandomOverSampler
-from collections import Counter
-import os
-from preprocessing import *
-from sklearn import set_config
-from importlib import reload
-set_config(transform_output = "pandas")
-
-dtype = "float32"
-activation = "relu"
-
-lr = 0.001
-batch_size = 512
-epochs = 50 # default 400 epochs
-
-lr_schedule = keras.optimizers.schedules.ExponentialDecay(
-    initial_learning_rate=lr,
-    decay_steps=2000,
-    decay_rate=0.9,
-    staircase=True
-)
-
-optimizer_simple = keras.optimizers.Adam(learning_rate=lr_schedule)
-optimizer_large = keras.optimizers.Adam(learning_rate=lr_schedule)
-optimizer_paper = keras.optimizers.Adam(learning_rate=lr_schedule)
-
-sample_fraction = 0.8
-
-# small model
-model_simple = keras.Sequential(
-    [
-        keras.Input(shape = (9,), dtype = "float32"),
-        keras.layers.Dense(units = 128, activation = "linear", dtype = "float32"),
-        # Dropout(0.2),
-        keras.layers.Dense(units = 128, activation = "elu", dtype = "float32"),
-        keras.layers.Dense(units = 9, dtype = "float32")
-    ]
-)
-
-def Safelog(val):
-    # get range of vector
-    if val > 0:
-        return np.log10(val)
-    elif val < 0:
-        return -np.log10(-val)
-    else:
-        return 0
-
-def Safeexp(val):
-    if val > 0:
-        return -10 ** -val
-    elif val < 0:
-        return 10 ** val
-    else:
-        return 0
-    
-# ? Why does the charge use a different logarithm than the other species?
-
-func_dict_in = {
-    "H" : np.log1p,
-    "O" : np.log1p,
-    "Charge" : Safelog,
-    "H_0_" : np.log1p,
-    "O_0_" : np.log1p,
-    "Ba" : np.log1p,
-    "Cl" : np.log1p,
-    "S_2_" : np.log1p,
-    "S_6_" : np.log1p,
-    "Sr" : np.log1p,
-    "Barite" : np.log1p,
-    "Celestite" : np.log1p,
-}
-
-func_dict_out = {
-    "H" : np.expm1,
-    "O" : np.expm1,
-    "Charge" : Safeexp,
-    "H_0_" : np.expm1,
-    "O_0_" : np.expm1,
-    "Ba" : np.expm1,
-    "Cl" : np.expm1,
-    "S_2_" : np.expm1,
-    "S_6_" : np.expm1,
-    "Sr" : np.expm1,
-    "Barite" : np.expm1,
-    "Celestite" : np.expm1,
-}
-
-# os.chdir('/mnt/beegfs/home/signer/projects/model-training')
-data_file = h5py.File("barite_50_4_corner.h5")
-
-design = data_file["design"]
-results = data_file["result"]
-
-df_design = pd.DataFrame(np.array(design["data"]).transpose(), columns = np.array(design["names"].asstr()))
-df_results = pd.DataFrame(np.array(results["data"]).transpose(), columns = np.array(results["names"].asstr()))
-
-data_file.close()
-
-species_columns = ['H', 'O', 'Charge', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']
-
-preprocess = preprocessing(func_dict_in=func_dict_in, func_dict_out=func_dict_out)
-X, y = preprocess.cluster(df_design[species_columns], df_results[species_columns])
-# X, y = preprocess.funcTranform(X, y)
-
-X_train, X_test, y_train, y_test = preprocess.split(X, y, ratio = 0.2)
-X_train, y_train = preprocess.balancer(X_train, y_train, strategy = "over")
-preprocess.scale_fit(X_train, y_train, scaling = "individual")
-X_train, X_test, y_train, y_test = preprocess.scale_transform(X_train, X_test, y_train, y_test)
-X_train, X_val, y_train, y_val = preprocess.split(X_train, y_train, ratio = 0.1)
-
-column_dict = {"Ba": X.columns.get_loc("Ba"), "Barite":X.columns.get_loc("Barite"), "Sr":X.columns.get_loc("Sr"), "Celestite":X.columns.get_loc("Celestite"), "H":X.columns.get_loc("H"), "H":X.columns.get_loc("H"), "O":X.columns.get_loc("O")}
-
-def custom_loss(preprocess, column_dict, h1, h2, h3, h4):
-    # extract the scaling parameters
-    scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
-    min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
-    scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
-    min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)
-
-    def loss(results, predicted):
-        # inverse min/max scaling
-        predicted_inverse = predicted * scale_X + min_X
-        results_inverse = results * scale_y + min_y
-
-        # mass balance
-        dBa = tf.keras.backend.abs(
-            (predicted_inverse[:, column_dict["Ba"]] + predicted_inverse[:, column_dict["Barite"]]) -
-            (results_inverse[:, column_dict["Ba"]] + results_inverse[:, column_dict["Barite"]])
-        )
-        dSr = tf.keras.backend.abs(
-            (predicted_inverse[:, column_dict["Sr"]] + predicted_inverse[:, column_dict["Celestite"]]) -
-            (results_inverse[:, column_dict["Sr"]] + results_inverse[:, column_dict["Celestite"]])
-        )
-        
-        # H/O ratio has to be 2
-        h2o_ratio = tf.keras.backend.abs(
-            (predicted_inverse[:, column_dict["H"]] / predicted_inverse[:, column_dict["O"]]) - 2
-        )
-
-        # huber loss
-        huber_loss = tf.keras.losses.Huber()(results, predicted)
-        
-        # total loss
-        total_loss = h1 * huber_loss + h2 * dBa**2 + h3 * dSr**2 #+ h4 * h2o_ratio**2
-
-        return total_loss
-
-    return loss
-
-def mass_balance(model, X, preprocess):
-    
-    # predict the chemistry
-    columns = X.iloc[:, X.columns != "Class"].columns
-    prediction = pd.DataFrame(model.predict(X[columns]), columns=columns)
-    
-    # backtransform min/max
-    X = pd.DataFrame(preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]), columns=columns)
-    prediction = pd.DataFrame(preprocess.scaler_y.inverse_transform(prediction), columns=columns)
-        
-    # calculate mass balance    dBa = np.abs((prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"]))
-    dSr = np.abs((prediction["Sr"] + prediction["Celestite"]) - (X["Sr"] + X["Celestite"]))
-    
-    return dBa + dSr
-
-import optuna
-
-def create_model(model, preprocess, h1, h2, h3, h4):
-    
-    model.compile(optimizer=optimizer_simple, loss=custom_loss(preprocess, column_dict, h1, h2, h3, h4))
-    
-    return model
-
-
-def objective(trial, preprocess, X_train, y_train, X_val, y_val, X_test, y_test):
-    h1 = trial.suggest_float("h1", 0.1, 10)
-    h2 = trial.suggest_float("h2", 0.1, 10)
-    h3 = trial.suggest_float("h3", 0.1, 10)
-    h4 = trial.suggest_float("h4", 0.1, 10)
-    
-    model = create_model(model_simple, preprocess, h1, h2, h3, h4)
-    
-    callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
-    history = model.fit(X_train.loc[:, X_train.columns != "Class"], 
-                        y_train.loc[:, y_train.columns != "Class"], 
-                        batch_size=batch_size, 
-                        epochs=50, 
-                        validation_data=(X_val.loc[:, X_val.columns != "Class"], y_val.loc[:, y_val.columns != "Class"]),
-                        callbacks=[callback])
-    
-    prediction_loss = model.evaluate(X_test.loc[:, X_test.columns != "Class"], y_test.loc[:, y_test.columns != "Class"])
-    mass_balance_results = mass_balance(model, X_test, preprocess)
-    
-    mass_balance_ratio = len(mass_balance_results[mass_balance_results < 1e-5]) / len(mass_balance_results)
-
-    return prediction_loss, mass_balance_ratio
-
-if __name__ == "__main__":
-    study  = optuna.create_study(storage="sqlite:///model_optimization.db", study_name="model_optimization", directions=["minimize", "maximize"])
-    study.optimize(lambda trial: objective(trial, preprocess, X_train, y_train, X_val, y_val, X_test, y_test), n_trials=1000)
-
-    print("Number of finished trials: ", len(study.trials))
-
-    print("Best trial:")
-    trial = study.best_trial
-
-    print("  Value: ", trial.value)
-
-    print("  Params: ")
-    for key, value in trial.params.items():
-        print("    {}: {}".format(key, value))
--- a/preprocessing.py
+++ b/preprocessing.py
@ -1,332 +0,0 @@
-import keras
-print("Running Keras in version {}".format(keras.__version__))
-
-import h5py
-import numpy as np
-import pandas as pd
-import time
-import sklearn.model_selection as sk
-import matplotlib.pyplot as plt
-from sklearn.cluster import KMeans
-from imblearn.over_sampling import SMOTE
-from imblearn.under_sampling import RandomUnderSampler
-from imblearn.over_sampling import RandomOverSampler
-from collections import Counter
-import os
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
-from sklearn.base import clone
-
-# preprocessing pipeline
-# 
-
-def Safelog(val):
-    # get range of vector
-    if val > 0:
-        return np.log10(val)
-    elif val < 0:
-        return -np.log10(-val)
-    else:
-        return 0
-
-def Safeexp(val):
-    if val > 0:
-        return -10 ** -val
-    elif val < 0:
-        return 10 ** val
-    else:
-        return 0
-    
-
-class FuncTransform():
-    '''
-    Class to transform and inverse transform data with given functions.
-    Transform and inverse transform functions have to be given as dictionaries in the following format:
-    {'key1': function1, 'key2': function2, ...}
-    '''
-
-    def __init__(self, func_transform, func_inverse):
-        self.func_transform = func_transform
-        self.func_inverse = func_inverse
-
-    def fit(self, X, y=None):
-        return self
-    
-    def transform(self, X, y=None):
-        X = X.copy()
-        for key in X.keys():   
-            if "Class" not in key:
-                X[key] = X[key].apply(self.func_transform[key])
-        return X
-    
-    def fit_transform(self, X, y=None):
-        self.fit(X)
-        return self.transform(X, y)
-    
-    def inverse_transform(self, X_log):
-        X_log = X_log.copy()
-        for key in X_log.keys():
-            if "Class" not in key:
-                X_log[key] = X_log[key].apply(self.func_inverse[key])
-        return X_log
-    
-
-def clustering(X, n_clusters=2, random_state=42, x_length=50, y_length=50, species='Barite'):
-    '''
-    Function to cluster data with KMeans.
-    '''
-
-    class_labels = np.array([])
-    grid_length = x_length * y_length
-    iterations = int(len(X) / grid_length)
-
-    for i in range(0, iterations):
-        field = np.array(X[species][(i*grid_length):(i*grid_length+grid_length)]
-                         ).reshape(x_length, y_length)
-        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(
-            field.reshape(-1, 1))
-
-        class_labels = np.append(class_labels.astype(int), kmeans.labels_)
-        
-    if("Class" in X.columns):
-        print("Class column already exists")
-    else:
-        class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
-        X_clustered = pd.concat([X, class_labels_df], axis=1)
-    
-    return X_clustered
-
-
-def balancer(design, target, strategy, sample_fraction=0.5):
-        
-    number_features = (design.columns != "Class").sum()
-    if("Class" not in design.columns):
-        if("Class" in target.columns):
-            classes = target['Class']
-        else:
-            raise Exception("No class column found")
-    else:
-        classes = design['Class']
-        counter = classes.value_counts()
-        print("Amount class 0 before:", counter[0] / (counter[0] + counter[1]) )
-        print("Amount class 1 before:", counter[1] / (counter[0] + counter[1]) )
-        df = pd.concat([design.loc[:,design.columns != "Class"], target.loc[:, target.columns != "Class"], classes], axis=1)
-        
-    if strategy == 'smote':
-        print("Using SMOTE strategy")
-        smote = SMOTE(sampling_strategy=sample_fraction)
-        df_resampled, classes_resampled = smote.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])
-        
-    elif strategy == 'over':
-        print("Using Oversampling")
-        over = RandomOverSampler()
-        df_resampled, classes_resampled = over.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])
-        
-    elif strategy == 'under':
-        print("Using Undersampling")
-        under = RandomUnderSampler()
-        df_resampled, classes_resampled = under.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])
-
-    else:
-        return design, target
-        
-    counter = classes_resampled["Class"].value_counts()
-    print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]) )
-    print("Amount class 1 after:", counter[1] / (counter[0] + counter[1]) )
-    
-    design_resampled = pd.concat([df_resampled.iloc[:,0:number_features], classes_resampled], axis=1)
-    target_resampled = pd.concat([df_resampled.iloc[:,number_features:], classes_resampled], axis=1)
-    
-    return design_resampled, target_resampled  
-
-
-def plot_simulation(X, timestep, component='Barite', x_length=50, y_length=50):
-    grid_length = x_length * y_length
-    max_iter = int(len(X) / grid_length)
-    if(timestep >= max_iter):
-        raise Exception("timestep is not in the simulation range") 
-    
-    plt.imshow(np.array(X[component][(timestep*grid_length):(timestep*grid_length+grid_length)]).reshape(x_length,y_length), interpolation='bicubic', origin='lower')
-    
-    if("Class" in X.columns):
-        plt.contour(np.array(X['Class'][(timestep*grid_length):(timestep*grid_length+grid_length)]).reshape(x_length,y_length), levels=[0.1], colors='red', origin='lower')
-        
-    plt.show()
-    
-
-def preprocessing_training(df_design, df_targets, func_dict_in, func_dict_out, sampling, scaling, test_size):
-    
-    df_design = clustering(df_design)
-    df_targets = pd.concat([df_targets, df_design['Class']], axis=1)
-    
-    df_design_log = FuncTransform(func_dict_in, func_dict_out).fit_transform(df_design)
-    df_results_log = FuncTransform(func_dict_in, func_dict_out).fit_transform(df_targets)
-
-    X_train, X_test, y_train, y_test = sk.train_test_split(df_design_log, df_results_log, test_size = test_size, random_state=42)
-
-    X_train, y_train = balancer(X_train, y_train, sampling)
-    
-    scaler_X = MinMaxScaler()
-    scaler_y = MinMaxScaler()
-    
-    if scaling == 'individual':
-        scaler_X.fit(X_train.iloc[:, X_train.columns != "Class"])
-        scaler_y.fit(y_train.iloc[:, y_train.columns != "Class"])
-        
-    elif scaling == 'global':
-        scaler_X.fit(pd.concat([X_train.iloc[:, X_train.columns != "Class"], y_train.iloc[:, y_train.columns != "Class"]], axis=0))
-        scaler_y = scaler_X
-    
-    X_train = pd.concat([scaler_X.transform(X_train.loc[:, X_train.columns != "Class"]), X_train.loc[:, "Class"]], axis=1)
-    X_test = pd.concat([scaler_X.transform(X_test.loc[:, X_test.columns != "Class"]), X_test.loc[:, "Class"]], axis=1)
-
-    y_train = pd.concat([scaler_y.transform(y_train.loc[:, y_train.columns != "Class"]), y_train.loc[:, "Class"]], axis=1)
-    y_test = pd.concat([scaler_y.transform(y_test.loc[:, y_test.columns != "Class"]), y_test.loc[:, "Class"]], axis=1)
-    
-    X_train, X_val, y_train, y_val = sk.train_test_split(X_train, y_train, test_size = 0.1)
-    
-    return X_train, X_val, X_test, y_train, y_val, y_test, scaler_X, scaler_y
-    
-    
-    
-class preprocessing:
-
-    def __init__(self, func_dict_in, func_dict_out, random_state=42):
-        self.random_state = random_state
-        self.scaler_X = None
-        self.scaler_y = None
-        self.func_dict_in = func_dict_in
-        self.func_dict_out = func_dict_out
-        self.state = {"cluster": False, "log": False, "balance": False, "scale": False}
-        
-    def funcTranform(self, X, y):
-        for key in X.keys():   
-            if "Class" not in key:
-                X[key] = X[key].apply(self.func_dict_in[key])
-                y[key] = y[key].apply(self.func_dict_in[key])
-        self.state["log"] = True
-        
-        return X, y
-        
-    def funcInverse(self, X, y):
-    
-        for key in X.keys():
-            if "Class" not in key:
-                X[key] = X[key].apply(self.func_dict_out[key])
-                y[key] = y[key].apply(self.func_dict_out[key])
-        self.state["log"] = False
-        return X, y
-        
-    def cluster(self, X, y, species='Barite', n_clusters=2, x_length=50, y_length=50):
-        
-        class_labels = np.array([])
-        grid_length = x_length * y_length
-        iterations = int(len(X) / grid_length)
-
-        for i in range(0, iterations):
-            field = np.array(X[species][(i*grid_length):(i*grid_length+grid_length)]
-                         ).reshape(x_length, y_length)
-            kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state).fit(field.reshape(-1, 1))
-            class_labels = np.append(class_labels.astype(int), kmeans.labels_)
-
-        if ("Class" in X.columns and "Class" in y.columns):
-            print("Class column already exists")
-        else:
-            class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
-            X = pd.concat([X, class_labels_df], axis=1)
-            y = pd.concat([y, class_labels_df], axis=1)
-            self.state["cluster"] = True
-            
-        return X, y
-            
-            
-    def balancer(self, X, y, strategy, sample_fraction=0.5):
-        
-        number_features = (X.columns != "Class").sum()
-        if("Class" not in X.columns):
-            if("Class" in y.columns):
-                classes = y['Class']
-            else:
-                raise Exception("No class column found")
-        else:
-            classes = X['Class']
-            counter = classes.value_counts()
-            print("Amount class 0 before:", counter[0] / (counter[0] + counter[1]) )
-            print("Amount class 1 before:", counter[1] / (counter[0] + counter[1]) )
-            df = pd.concat([X.loc[:,X.columns != "Class"], y.loc[:, y.columns != "Class"],  classes], axis=1)
-
-        if strategy == 'smote':
-            print("Using SMOTE strategy")
-            smote = SMOTE(sampling_strategy=sample_fraction)
-            df_resampled, classes_resampled = smote.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.    columns == "Class"])
-
-        elif strategy == 'over':
-            print("Using Oversampling")
-            over = RandomOverSampler()
-            df_resampled, classes_resampled = over.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df. columns == "Class"])
-
-        elif strategy == 'under':
-            print("Using Undersampling")
-            under = RandomUnderSampler()
-            df_resampled, classes_resampled = under.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.    columns == "Class"])
-
-        else:
-            return X, y
-
-        counter = classes_resampled["Class"].value_counts()
-        print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]) )
-        print("Amount class 1 after:", counter[1] / (counter[0] + counter[1]) )
-
-        design_resampled = pd.concat([df_resampled.iloc[:,0:number_features], classes_resampled], axis=1)
-        target_resampled = pd.concat([df_resampled.iloc[:,number_features:], classes_resampled], axis=1)
-
-        self.state['balance'] = True
-        return design_resampled, target_resampled
-    
-    
-    def scale_fit(self, X, y, scaling):
-        
-        if scaling == 'individual':
-            self.scaler_X = MinMaxScaler()
-            self.scaler_y = MinMaxScaler()
-            self.scaler_X.fit(X.iloc[:, X.columns != "Class"])
-            self.scaler_y.fit(y.iloc[:, y.columns != "Class"])
-            
-        elif scaling == 'global':
-            self.scaler_X = MinMaxScaler()
-            self.scaler_X.fit(pd.concat([X.iloc[:, X.columns != "Class"], y.iloc[:, y.columns != "Class"]], axis=0))
-            self.scaler_y = self.scaler_X
-            
-        self.state['scale'] = True
-        
-    def scale_transform(self, X_train, X_test, y_train, y_test):
-        X_train = pd.concat([self.scaler_X.transform(X_train.loc[:, X_train.columns != "Class"]), X_train.loc[:, "Class"]], axis=1)
-        
-        X_test = pd.concat([self.scaler_X.transform(X_test.loc[:, X_test.columns != "Class"]), X_test.loc[:, "Class"]], axis=1)
-
-        y_train = pd.concat([self.scaler_y.transform(y_train.loc[:, y_train.columns != "Class"]), y_train.loc[:, "Class"]], axis=1)
-        
-        y_test = pd.concat([self.scaler_y.transform(y_test.loc[:, y_test.columns != "Class"]), y_test.loc[:, "Class"]], axis=1)
-        
-        return X_train, X_test, y_train, y_test
-    
-    def scale_inverse(self, X):
-        
-        if("Class" in X.columns):
-            X = pd.concat([self.scaler_X.inverse_transform(X.loc[:, X.columns != "Class"]), X.loc[:, "Class"]], axis=1)
-        else:
-            X = self.scaler_X.inverse_transform(X)  
-        
-        return X
-          
-    def split(self, X, y, ratio=0.8):
-        X_train, y_train, X_test, y_test = sk.train_test_split(X, y, test_size = ratio, random_state=self.random_state)
-        
-        return X_train, y_train, X_test, y_test
-    
-    
-
-
-
-
-
-   
--- a/src/POET_Training.ipynb
+++ b/src/POET_Training.ipynb
--- a/src/convert_data.jl
+++ b/src/convert_data.jl
--- a/src/optuna_runs.py
+++ b/src/optuna_runs.py
@ -0,0 +1,106 @@
+import keras
+from keras.layers import Dense, Dropout, Input,BatchNormalization
+import tensorflow as tf
+import h5py
+import numpy as np
+import pandas as pd
+import time
+import sklearn.model_selection as sk
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from imblearn.over_sampling import SMOTE
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import RandomOverSampler
+from collections import Counter
+import os
+from preprocessing import *
+from sklearn import set_config
+from importlib import reload
+set_config(transform_output = "pandas")
+import optuna
+import pickle
+
+data_file = h5py.File("../datasets/barite_50_4_corner.h5")
+
+def objective(trial, preprocess, X, y, species_columns):
+    
+    model_type = trial.suggest_categorical("model", ["simple", "large", "paper"])
+    scaler_type = trial.suggest_categorical("scaler", ["standard", "minmax"])
+    sampling_type = trial.suggest_categorical("sampling", ["over", "off"])
+    
+    preprocess = preprocessing()
+    X, y = preprocess.cluster(df_design[species_columns], df_results[species_columns])
+    X_train, X_test, y_train, y_test = preprocess.split(X, y, ratio = 0.2)
+    X_train, y_train = preprocess.balancer(X_train, y_train, strategy = sampling_type)
+    preprocess.scale_fit(X_train, y_train, scaling = "global", type=scaler_type)
+    X_train, X_test, y_train, y_test = preprocess.scale_transform(X_train, X_test, y_train, y_test)
+    X_train, X_val, y_train, y_val = preprocess.split(X_train, y_train, ratio = 0.1)
+
+    column_dict = {"Ba": X.columns.get_loc("Ba"), "Barite":X.columns.get_loc("Barite"), "Sr":X.columns.get_loc("Sr"), "Celestite":X.columns.get_loc("Celestite"), "H":X.columns.get_loc("H"), "H":X.columns.get_loc("H"), "O":X.columns.get_loc("O")}
+
+    h1 = trial.suggest_float("h1", 0.1, 1)
+    h2 = trial.suggest_float("h2", 0.1, 1)
+    h3 = trial.suggest_float("h3", 0.1, 1)
+    
+    
+    model = model_definition(model_type)
+    
+    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
+        initial_learning_rate=0.001,
+        decay_steps=2000,
+        decay_rate=0.9,
+        staircase=True
+    )
+    optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)
+    
+    model.compile(optimizer=optimizer, loss=custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type), metrics=[huber_metric(1.0), mass_balance_metric(preprocess, column_dict, scaler_type="minmax")])
+    
+    callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
+    history = model.fit(X_train.loc[:, X_train.columns != "Class"], 
+                        y_train.loc[:, y_train.columns != "Class"], 
+                        batch_size=512, 
+                        epochs=100, 
+                        validation_data=(X_val.loc[:, X_val.columns != "Class"], y_val.loc[:, y_val.columns != "Class"]),
+                        callbacks=[callback])
+    
+    prediction_loss = model.evaluate(X_test.loc[:, X_test.columns != "Class"], y_test.loc[:, y_test.columns != "Class"])
+    mass_balance_results = mass_balance_evaluation(model, X_test, preprocess)
+    
+    mass_balance_ratio = len(mass_balance_results[mass_balance_results < 1e-5]) / len(mass_balance_results)
+    
+    model_save_path_trial = os.path.join("../results/models/", f"model_trial_{trial.number}.h5")
+    history_save_path_trial = os.path.join("../results/history/", f"history_trial_{trial.number}.pkl")
+
+    model.save(model_save_path_trial)
+    with open(history_save_path_trial, 'wb') as f:
+        pickle.dump(history.history, f)
+
+    return prediction_loss, mass_balance_ratio
+
+if __name__ == "__main__":
+    
+    design = data_file["design"]
+    results = data_file["result"]
+
+    df_design = pd.DataFrame(np.array(design["data"]).transpose(), columns = np.array(design["names"].asstr()))
+    df_results = pd.DataFrame(np.array(results["data"]).transpose(), columns = np.array(results["names"].asstr()))
+
+    data_file.close()
+
+    species_columns = ['H', 'O', 'Charge', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']
+    
+    study  = optuna.create_study(storage="sqlite:///model_optimization.db", study_name="model_optimization", directions=["minimize", "maximize"])
+    study.optimize(lambda trial: objective(trial, df_design, df_results, species_columns), n_trials=1000)
+
+    print("Number of finished trials: ", len(study.trials))
+
+    print("Best trial:")
+    trial = study.best_trial
+
+    print("  Value: ", trial.value)
+
+    print("  Params: ")
+    for key, value in trial.params.items():
+        print("    {}: {}".format(key, value))
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@ -0,0 +1,357 @@
+import keras
+from keras.layers import Dense, Dropout, Input,BatchNormalization, LeakyReLU
+import tensorflow as tf
+import h5py
+import numpy as np
+import pandas as pd
+import time
+import sklearn.model_selection as sk
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from imblearn.over_sampling import SMOTE
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.over_sampling import RandomOverSampler
+from collections import Counter
+import os
+from preprocessing import *
+from sklearn import set_config
+from importlib import reload
+set_config(transform_output = "pandas")
+
+# preprocessing pipeline
+# 
+
+def Safelog(val):
+    """Return a sign-mirrored base-10 logarithm of the scalar ``val``.
+
+    Positive input maps to ``log10(val)``, negative input maps to
+    ``-log10(-val)``; any value that is neither positive nor negative
+    (zero, NaN) maps to ``0``.
+
+    ``Safeexp`` undoes this mapping only for ``0 < |val| < 1`` (and for 0).
+    NOTE(review): presumably all inputs lie in that range - confirm.
+    """
+    is_positive = val > 0
+    if not (is_positive or val < 0):
+        return 0
+
+    # Take the logarithm of the magnitude and mirror it for negative input.
+    magnitude_log = np.log10(val if is_positive else -val)
+    return magnitude_log if is_positive else -magnitude_log
+
+def Safeexp(val):
+    """Map a ``Safelog`` output back to the linear scale.
+
+    Positive input yields ``-(10 ** -val)``, negative input yields
+    ``10 ** val``; any value that is neither positive nor negative
+    (zero, NaN) yields ``0``.
+
+    This reverses ``Safelog`` only for original values with ``0 < |x| < 1``
+    (and for 0), because the sign of the logarithm is used to recover the
+    sign of the original value.
+    """
+    if val > 0:
+        # A positive logarithm stems from a negative original value.
+        return -(10 ** -val)
+    if val < 0:
+        # A negative logarithm stems from a positive original value.
+        return 10 ** val
+    return 0
+    
+    
+def model_definition(architecture):
+    """Build an uncompiled, fully connected Keras model with 8 inputs and 8 outputs.
+
+    Every hidden ``Dense`` layer is followed by ``LeakyReLU(alpha=0.01)``;
+    the final ``Dense(8)`` layer has no activation.
+
+    Args:
+        architecture: One of ``"small"`` (128-128), ``"large"`` (512-1024-512)
+            or ``"paper"`` (128-256-512-256); the numbers are the hidden
+            layer widths.
+
+    Returns:
+        The assembled ``keras.Sequential`` model.
+
+    Raises:
+        ValueError: If ``architecture`` is not one of the known names
+            (previously this surfaced as an UnboundLocalError on return).
+    """
+    dtype = "float32"
+
+    # Hidden layer widths per architecture; input and output size are fixed to 8.
+    hidden_units = {
+        "small": (128, 128),
+        "large": (512, 1024, 512),
+        "paper": (128, 256, 512, 256),
+    }
+
+    if architecture not in hidden_units:
+        raise ValueError(
+            "Unknown architecture {!r}, expected one of {}".format(architecture, sorted(hidden_units))
+        )
+
+    layers = [keras.layers.Input(shape=(8,), dtype=dtype)]
+    for units in hidden_units[architecture]:
+        layers.append(keras.layers.Dense(units, dtype=dtype))
+        layers.append(LeakyReLU(alpha=0.01))
+    layers.append(keras.layers.Dense(8, dtype=dtype))
+
+    return keras.Sequential(layers)
+    
+    
+def custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type="minmax"):
+    """Create a Keras loss that adds Ba/Sr mass-balance penalties to the Huber loss.
+
+    Both tensors handed to the loss (targets and predictions) are in the
+    space of ``preprocess.scaler_y`` (``scale_transform`` scales the targets
+    with ``scaler_y``), so both are mapped back with the ``scaler_y``
+    parameters before the mass balance is evaluated.
+
+    Args:
+        preprocess: Object whose fitted ``scaler_y`` exposes ``scale_`` and
+            ``min_`` (min/max scaler) or ``mean_`` (standard scaler).
+        column_dict: Maps the names "Ba", "Barite", "Sr" and "Celestite" to
+            their column index in the model output.
+        h1: Weight of the Huber loss.
+        h2: Weight of the Ba + Barite balance error.
+        h3: Weight of the Sr + Celestite balance error.
+        scaler_type: ``"minmax"`` or ``"standard"``.
+
+    Returns:
+        A function ``loss(results, predicted)`` usable as Keras loss.
+
+    Raises:
+        ValueError: If ``scaler_type`` is neither "minmax" nor "standard".
+    """
+    if scaler_type == "minmax":
+        offset_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)
+    elif scaler_type == "standard":
+        offset_y = tf.convert_to_tensor(preprocess.scaler_y.mean_, dtype=tf.float32)
+    else:
+        raise ValueError("No valid scaler type found")
+    scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
+
+    def unscale(scaled):
+        # MinMaxScaler transforms with x * scale_ + min_, hence the inverse is
+        # (x - min_) / scale_. StandardScaler transforms with
+        # (x - mean_) / scale_, hence the inverse is x * scale_ + mean_.
+        if scaler_type == "minmax":
+            return (scaled - offset_y) / scale_y
+        return scaled * scale_y + offset_y
+
+    def element_total(values, dissolved, mineral):
+        # amount in solution plus amount bound in the mineral phase
+        return values[:, column_dict[dissolved]] + values[:, column_dict[mineral]]
+
+    def loss(results, predicted):
+        predicted_inverse = unscale(predicted)
+        results_inverse = unscale(results)
+
+        # mass balance: the element totals of prediction and target must agree
+        dBa = tf.abs(
+            element_total(predicted_inverse, "Ba", "Barite") -
+            element_total(results_inverse, "Ba", "Barite")
+        )
+        dSr = tf.abs(
+            element_total(predicted_inverse, "Sr", "Celestite") -
+            element_total(results_inverse, "Sr", "Celestite")
+        )
+
+        # An additional H/O == 2 ratio penalty (weight h4) was prototyped here
+        # but is currently disabled.
+
+        # huber loss on the scaled values
+        huber_loss = tf.keras.losses.Huber()(results, predicted)
+
+        # total loss
+        total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr
+        return total_loss
+
+    return loss
+
+def mass_balance_evaluation(model, X, preprocess):
+    """Predict from ``X`` and measure the Ba and Sr mass-balance error.
+
+    The "Class" column of ``X`` is ignored. Input and prediction are mapped
+    back with ``preprocess.scaler_X`` and ``preprocess.scaler_y``
+    respectively before the totals are compared. The minimum of each error
+    series is printed.
+
+    Returns:
+        Tuple ``(dBa, dSr, prediction)``: the absolute Ba + Barite error, the
+        absolute Sr + Celestite error and the back-transformed prediction.
+    """
+    feature_mask = X.columns != "Class"
+    columns = X.columns[feature_mask]
+
+    # predict the chemistry (still in scaled space)
+    scaled_prediction = pd.DataFrame(model.predict(X[columns]), columns=columns)
+
+    # undo the min/max or standard scaling of input and prediction
+    X = pd.DataFrame(
+        preprocess.scaler_X.inverse_transform(X.iloc[:, feature_mask]),
+        columns=columns,
+    )
+    prediction = pd.DataFrame(
+        preprocess.scaler_y.inverse_transform(scaled_prediction),
+        columns=columns,
+    )
+
+    def balance_error(dissolved, mineral):
+        # absolute difference of the element total (solution + mineral)
+        predicted_total = prediction[dissolved] + prediction[mineral]
+        input_total = X[dissolved] + X[mineral]
+        return np.abs(predicted_total - input_total)
+
+    dBa = balance_error("Ba", "Barite")
+    print(dBa.min())
+    dSr = balance_error("Sr", "Celestite")
+    print(dSr.min())
+    return dBa, dSr, prediction
+
+
+def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
+    """Create a Keras metric reporting the mean Ba + Sr mass-balance error.
+
+    Both tensors handed to the metric (targets and predictions) are in the
+    space of ``preprocess.scaler_y`` (``scale_transform`` scales the targets
+    with ``scaler_y``), so both are mapped back with the ``scaler_y``
+    parameters before the mass balance is evaluated.
+
+    Args:
+        preprocess: Object whose fitted ``scaler_y`` exposes ``scale_`` and
+            ``min_`` (min/max scaler) or ``mean_`` (standard scaler).
+        column_dict: Maps the names "Ba", "Barite", "Sr" and "Celestite" to
+            their column index in the model output.
+        scaler_type: ``"minmax"`` or ``"standard"``.
+
+    Returns:
+        A function ``mass_balance(results, predicted)`` returning the mean of
+        ``dBa + dSr`` over the batch.
+
+    Raises:
+        ValueError: If ``scaler_type`` is neither "minmax" nor "standard".
+    """
+    if scaler_type == "minmax":
+        offset_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)
+    elif scaler_type == "standard":
+        offset_y = tf.convert_to_tensor(preprocess.scaler_y.mean_, dtype=tf.float32)
+    else:
+        raise ValueError("No valid scaler type found")
+    scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
+
+    def unscale(scaled):
+        # MinMaxScaler transforms with x * scale_ + min_, hence the inverse is
+        # (x - min_) / scale_. StandardScaler transforms with
+        # (x - mean_) / scale_, hence the inverse is x * scale_ + mean_.
+        if scaler_type == "minmax":
+            return (scaled - offset_y) / scale_y
+        return scaled * scale_y + offset_y
+
+    def element_total(values, dissolved, mineral):
+        # amount in solution plus amount bound in the mineral phase
+        return values[:, column_dict[dissolved]] + values[:, column_dict[mineral]]
+
+    def mass_balance(results, predicted):
+        predicted_inverse = unscale(predicted)
+        results_inverse = unscale(results)
+
+        # mass balance: the element totals of prediction and target must agree
+        dBa = tf.abs(
+            element_total(predicted_inverse, "Ba", "Barite") -
+            element_total(results_inverse, "Ba", "Barite")
+        )
+        dSr = tf.abs(
+            element_total(predicted_inverse, "Sr", "Celestite") -
+            element_total(results_inverse, "Sr", "Celestite")
+        )
+
+        return tf.reduce_mean(dBa + dSr)
+
+    return mass_balance
+
+
+def huber_metric(delta=1.0):
+    """Create a metric function that evaluates the Huber loss with a fixed ``delta``.
+
+    The returned function is named ``huber`` and forwards its two arguments,
+    together with the captured ``delta``, to ``tf.keras.losses.huber``.
+    """
+    def huber(y_true, y_pred):
+        value = tf.keras.losses.huber(y_true, y_pred, delta=delta)
+        return value
+
+    return huber
+    
+    
+class preprocessing:
+    """Bundle of the data preparation steps applied to design (X) and result (y) frames.
+
+    The steps are: element-wise function transform, k-means labelling of every
+    grid snapshot, class balancing, scaling and the train/test split. The
+    ``state`` dict records which of the steps "cluster", "log", "balance" and
+    "scale" have been applied. A column named "Class" carries the cluster
+    label and is excluded from all numeric transformations.
+    """
+
+    def __init__(self, func_dict_in=None, func_dict_out=None, random_state=42):
+        """Store the configuration; the scalers are created later by ``scale_fit``.
+
+        Args:
+            func_dict_in: Maps column name to the callable applied by ``funcTranform``.
+            func_dict_out: Maps column name to the callable applied by ``funcInverse``.
+            random_state: Seed used for k-means, the samplers and the split.
+        """
+        self.random_state = random_state
+        self.scaler_X = None
+        self.scaler_y = None
+        self.func_dict_in = func_dict_in
+        self.func_dict_out = func_dict_out
+        self.state = {"cluster": False, "log": False, "balance": False, "scale": False}
+
+    def funcTranform(self, X, y):
+        """Apply ``func_dict_in[column]`` element-wise to every non-"Class" column.
+
+        The columns are taken from ``X``; the same function is applied to the
+        column of the same name in ``y``. Both frames are modified in place
+        and returned.
+        """
+        for key in X.keys():
+            if "Class" not in key:
+                X[key] = X[key].apply(self.func_dict_in[key])
+                y[key] = y[key].apply(self.func_dict_in[key])
+        self.state["log"] = True
+
+        return X, y
+
+    def funcInverse(self, X, y):
+        """Apply ``func_dict_out[column]`` element-wise to every non-"Class" column.
+
+        Counterpart of ``funcTranform``; both frames are modified in place
+        and returned.
+        """
+        for key in X.keys():
+            if "Class" not in key:
+                X[key] = X[key].apply(self.func_dict_out[key])
+                y[key] = y[key].apply(self.func_dict_out[key])
+        self.state["log"] = False
+        return X, y
+
+    def cluster(self, X, y, species='Barite', n_clusters=2, x_length=50, y_length=50):
+        """Label every grid cell by k-means on ``species`` and append a "Class" column.
+
+        ``X`` is processed in consecutive chunks of ``x_length * y_length``
+        rows; each chunk gets its own k-means fit on the single ``species``
+        column. Rows beyond the last complete chunk receive no label.
+
+        NOTE(review): the label numbering of k-means is arbitrary per fit, so
+        class 0 of one chunk need not correspond to class 0 of another one -
+        confirm this is acceptable for the later balancing.
+
+        Returns:
+            ``(X, y)`` with the "Class" column appended, or unchanged when
+            both frames already carry one.
+        """
+        class_labels = np.array([])
+        grid_length = x_length * y_length
+        iterations = int(len(X) / grid_length)
+
+        for i in range(0, iterations):
+            start = i * grid_length
+            # one sample per grid cell, one feature (the species amount)
+            field = np.array(X[species][start:start + grid_length]).reshape(-1, 1)
+            kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state).fit(field)
+            class_labels = np.append(class_labels.astype(int), kmeans.labels_)
+
+        if ("Class" in X.columns and "Class" in y.columns):
+            print("Class column already exists")
+        else:
+            class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
+            X = pd.concat([X, class_labels_df], axis=1)
+            y = pd.concat([y, class_labels_df], axis=1)
+            self.state["cluster"] = True
+
+        return X, y
+
+    def balancer(self, X, y, strategy, sample_fraction=0.5):
+        """Resample design and target rows jointly to balance the "Class" labels.
+
+        Args:
+            X: Design frame.
+            y: Target frame with the same rows as ``X``.
+            strategy: ``'smote'``, ``'over'`` or ``'under'``; any other value
+                returns ``X`` and ``y`` unchanged.
+            sample_fraction: ``sampling_strategy`` handed to SMOTE (unused by
+                the other strategies).
+
+        Returns:
+            ``(design_resampled, target_resampled)``, each with the resampled
+            "Class" column appended.
+
+        Raises:
+            Exception: If neither ``X`` nor ``y`` has a "Class" column.
+        """
+        number_features = (X.columns != "Class").sum()
+
+        if "Class" in X.columns:
+            classes = X['Class']
+        elif "Class" in y.columns:
+            classes = y['Class']
+        else:
+            raise Exception("No class column found")
+
+        # Built for both label sources; previously this only happened when X
+        # carried the labels, so labels present only in y crashed below.
+        counter = classes.value_counts()
+        print("Amount class 0 before:", counter[0] / (counter[0] + counter[1]) )
+        print("Amount class 1 before:", counter[1] / (counter[0] + counter[1]) )
+        # features of X first, then features of y, then the label
+        df = pd.concat([X.loc[:, X.columns != "Class"], y.loc[:, y.columns != "Class"], classes], axis=1)
+
+        # The samplers are seeded so that repeated runs resample identically.
+        if strategy == 'smote':
+            print("Using SMOTE strategy")
+            sampler = SMOTE(sampling_strategy=sample_fraction, random_state=self.random_state)
+        elif strategy == 'over':
+            print("Using Oversampling")
+            sampler = RandomOverSampler(random_state=self.random_state)
+        elif strategy == 'under':
+            print("Using Undersampling")
+            sampler = RandomUnderSampler(random_state=self.random_state)
+        else:
+            return X, y
+
+        df_resampled, classes_resampled = sampler.fit_resample(
+            df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"]
+        )
+
+        counter = classes_resampled["Class"].value_counts()
+        print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]) )
+        print("Amount class 1 after:", counter[1] / (counter[0] + counter[1]) )
+
+        # split the joint frame back into its design and target part
+        design_resampled = pd.concat([df_resampled.iloc[:, 0:number_features], classes_resampled], axis=1)
+        target_resampled = pd.concat([df_resampled.iloc[:, number_features:], classes_resampled], axis=1)
+
+        self.state['balance'] = True
+        return design_resampled, target_resampled
+
+    def scale_fit(self, X, y, scaling, type='Standard'):
+        """Create and fit the scalers on the non-"Class" columns.
+
+        Args:
+            X: Design frame.
+            y: Target frame.
+            scaling: ``'individual'`` fits one scaler on ``X`` and one on
+                ``y``; ``'global'`` fits a single scaler on the rows of both
+                frames and uses it for ``X`` and ``y``.
+            type: ``'minmax'`` or ``'standard'``, compared case-insensitively
+                so that the default ``'Standard'`` selects the standard scaler.
+
+        Raises:
+            ValueError: If ``type`` or ``scaling`` is not one of the values above.
+        """
+        scaler_kind = str(type).lower()
+        if scaler_kind == 'minmax':
+            scaler_class = MinMaxScaler
+        elif scaler_kind == 'standard':
+            scaler_class = StandardScaler
+        else:
+            raise ValueError("No valid scaler type found")
+
+        # Validate before touching any state, so a bad argument cannot leave
+        # unfitted scalers behind a "scale": True flag.
+        if scaling not in ('individual', 'global'):
+            raise ValueError("No valid scaling mode found")
+
+        X_features = X.iloc[:, X.columns != "Class"]
+        y_features = y.iloc[:, y.columns != "Class"]
+
+        self.scaler_X = scaler_class()
+        if scaling == 'individual':
+            self.scaler_y = scaler_class()
+            self.scaler_X.fit(X_features)
+            self.scaler_y.fit(y_features)
+        else:
+            self.scaler_X.fit(pd.concat([X_features, y_features], axis=0))
+            self.scaler_y = self.scaler_X
+
+        self.state['scale'] = True
+
+    def scale_transform(self, X_train, X_test, y_train, y_test):
+        """Scale the non-"Class" columns of all four frames with the fitted scalers.
+
+        Relies on the scalers returning DataFrames from ``transform`` (see the
+        module-level ``set_config(transform_output="pandas")``). Every frame
+        must contain a "Class" column, which is re-attached unscaled.
+
+        Returns:
+            ``(X_train, X_test, y_train, y_test)`` in scaled form.
+        """
+        def scaled_with_class(scaler, frame):
+            features = frame.loc[:, frame.columns != "Class"]
+            return pd.concat([scaler.transform(features), frame.loc[:, "Class"]], axis=1)
+
+        X_train = scaled_with_class(self.scaler_X, X_train)
+        X_test = scaled_with_class(self.scaler_X, X_test)
+        y_train = scaled_with_class(self.scaler_y, y_train)
+        y_test = scaled_with_class(self.scaler_y, y_test)
+
+        return X_train, X_test, y_train, y_test
+
+    def scale_inverse(self, X):
+        """Undo the ``scaler_X`` scaling of ``X``.
+
+        With a "Class" column present, a DataFrame with the restored features
+        plus the untouched "Class" column is returned; otherwise the direct
+        result of ``scaler_X.inverse_transform`` is returned.
+        """
+        if "Class" in X.columns:
+            feature_columns = X.columns[X.columns != "Class"]
+            # inverse_transform is not covered by the pandas output setting
+            # and yields a plain array, which pd.concat rejects; rebuild the
+            # frame with the original labels first.
+            restored = pd.DataFrame(
+                np.asarray(self.scaler_X.inverse_transform(X.loc[:, feature_columns])),
+                columns=feature_columns,
+                index=X.index,
+            )
+            X = pd.concat([restored, X.loc[:, "Class"]], axis=1)
+        else:
+            X = self.scaler_X.inverse_transform(X)
+
+        return X
+
+    def split(self, X, y, ratio=0.8):
+        """Split ``X`` and ``y`` into train and test parts.
+
+        Returns:
+            ``(X_train, X_test, y_train, y_test)`` - the order produced by
+            ``train_test_split`` and expected by ``scale_transform``.
+
+        NOTE(review): ``ratio`` is forwarded as ``test_size``, so the default
+        of 0.8 assigns 80 % of the rows to the test part - confirm whether a
+        train fraction was intended.
+        """
+        X_train, X_test, y_train, y_test = sk.train_test_split(
+            X, y, test_size=ratio, random_state=self.random_state
+        )
+
+        return X_train, X_test, y_train, y_test
+    
+    
+
+
+
+
+
+