import keras from keras.layers import Dense, Dropout, Input,BatchNormalization import tensorflow as tf import h5py import numpy as np import pandas as pd import time import sklearn.model_selection as sk import matplotlib.pyplot as plt from sklearn.cluster import KMeans from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import StandardScaler, MinMaxScaler from imblearn.over_sampling import SMOTE from imblearn.under_sampling import RandomUnderSampler from imblearn.over_sampling import RandomOverSampler from collections import Counter import os from preprocessing import * from sklearn import set_config from importlib import reload set_config(transform_output = "pandas") dtype = "float32" activation = "relu" lr = 0.001 batch_size = 512 epochs = 50 # default 400 epochs lr_schedule = keras.optimizers.schedules.ExponentialDecay( initial_learning_rate=lr, decay_steps=2000, decay_rate=0.9, staircase=True ) optimizer_simple = keras.optimizers.Adam(learning_rate=lr_schedule) optimizer_large = keras.optimizers.Adam(learning_rate=lr_schedule) optimizer_paper = keras.optimizers.Adam(learning_rate=lr_schedule) sample_fraction = 0.8 # small model model_simple = keras.Sequential( [ keras.Input(shape = (9,), dtype = "float32"), keras.layers.Dense(units = 128, activation = "linear", dtype = "float32"), # Dropout(0.2), keras.layers.Dense(units = 128, activation = "elu", dtype = "float32"), keras.layers.Dense(units = 9, dtype = "float32") ] ) def Safelog(val): # get range of vector if val > 0: return np.log10(val) elif val < 0: return -np.log10(-val) else: return 0 def Safeexp(val): if val > 0: return -10 ** -val elif val < 0: return 10 ** val else: return 0 # ? Why does the charge is using another logarithm than the other species func_dict_in = { "H" : np.log1p, "O" : np.log1p, "Charge" : Safelog, "H_0_" : np.log1p, "O_0_" : np.log1p, "Ba" : np.log1p, "Cl" : np.log1p, "S_2_" : np.log1p, "S_6_" : np.log1p, "Sr" : np.log1p, "Barite" : np.log1p, "Celestite" : np.log1p, } func_dict_out = { "H" : np.expm1, "O" : np.expm1, "Charge" : Safeexp, "H_0_" : np.expm1, "O_0_" : np.expm1, "Ba" : np.expm1, "Cl" : np.expm1, "S_2_" : np.expm1, "S_6_" : np.expm1, "Sr" : np.expm1, "Barite" : np.expm1, "Celestite" : np.expm1, } # os.chdir('/mnt/beegfs/home/signer/projects/model-training') data_file = h5py.File("barite_50_4_corner.h5") design = data_file["design"] results = data_file["result"] df_design = pd.DataFrame(np.array(design["data"]).transpose(), columns = np.array(design["names"].asstr())) df_results = pd.DataFrame(np.array(results["data"]).transpose(), columns = np.array(results["names"].asstr())) data_file.close() species_columns = ['H', 'O', 'Charge', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite'] preprocess = preprocessing(func_dict_in=func_dict_in, func_dict_out=func_dict_out) X, y = preprocess.cluster(df_design[species_columns], df_results[species_columns]) # X, y = preprocess.funcTranform(X, y) X_train, X_test, y_train, y_test = preprocess.split(X, y, ratio = 0.2) X_train, y_train = preprocess.balancer(X_train, y_train, strategy = "over") preprocess.scale_fit(X_train, y_train, scaling = "individual") X_train, X_test, y_train, y_test = preprocess.scale_transform(X_train, X_test, y_train, y_test) X_train, X_val, y_train, y_val = preprocess.split(X_train, y_train, ratio = 0.1) column_dict = {"Ba": X.columns.get_loc("Ba"), "Barite":X.columns.get_loc("Barite"), "Sr":X.columns.get_loc("Sr"), "Celestite":X.columns.get_loc("Celestite"), "H":X.columns.get_loc("H"), "H":X.columns.get_loc("H"), "O":X.columns.get_loc("O")} def custom_loss(preprocess, column_dict, h1, h2, h3, h4): # extract the scaling parameters scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32) min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32) scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32) min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32) def loss(results, predicted): # inverse min/max scaling predicted_inverse = predicted * scale_X + min_X results_inverse = results * scale_y + min_y # mass balance dBa = tf.keras.backend.abs( (predicted_inverse[:, column_dict["Ba"]] + predicted_inverse[:, column_dict["Barite"]]) - (results_inverse[:, column_dict["Ba"]] + results_inverse[:, column_dict["Barite"]]) ) dSr = tf.keras.backend.abs( (predicted_inverse[:, column_dict["Sr"]] + predicted_inverse[:, column_dict["Celestite"]]) - (results_inverse[:, column_dict["Sr"]] + results_inverse[:, column_dict["Celestite"]]) ) # H/O ratio has to be 2 h2o_ratio = tf.keras.backend.abs( (predicted_inverse[:, column_dict["H"]] / predicted_inverse[:, column_dict["O"]]) - 2 ) # huber loss huber_loss = tf.keras.losses.Huber()(results, predicted) # total loss total_loss = h1 * huber_loss + h2 * dBa**2 + h3 * dSr**2 #+ h4 * h2o_ratio**2 return total_loss return loss def mass_balance(model, X, preprocess): # predict the chemistry columns = X.iloc[:, X.columns != "Class"].columns prediction = pd.DataFrame(model.predict(X[columns]), columns=columns) # backtransform min/max X = pd.DataFrame(preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]), columns=columns) prediction = pd.DataFrame(preprocess.scaler_y.inverse_transform(prediction), columns=columns) # calculate mass balance dBa = np.abs((prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"])) dSr = np.abs((prediction["Sr"] + prediction["Celestite"]) - (X["Sr"] + X["Celestite"])) return dBa + dSr import optuna def create_model(model, preprocess, h1, h2, h3, h4): model.compile(optimizer=optimizer_simple, loss=custom_loss(preprocess, column_dict, h1, h2, h3, h4)) return model def objective(trial, preprocess, X_train, y_train, X_val, y_val, X_test, y_test): h1 = trial.suggest_float("h1", 0.1, 10) h2 = trial.suggest_float("h2", 0.1, 10) h3 = trial.suggest_float("h3", 0.1, 10) h4 = trial.suggest_float("h4", 0.1, 10) model = create_model(model_simple, preprocess, h1, h2, h3, h4) callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3) history = model.fit(X_train.loc[:, X_train.columns != "Class"], y_train.loc[:, y_train.columns != "Class"], batch_size=batch_size, epochs=50, validation_data=(X_val.loc[:, X_val.columns != "Class"], y_val.loc[:, y_val.columns != "Class"]), callbacks=[callback]) prediction_loss = model.evaluate(X_test.loc[:, X_test.columns != "Class"], y_test.loc[:, y_test.columns != "Class"]) mass_balance_results = mass_balance(model, X_test, preprocess) mass_balance_ratio = len(mass_balance_results[mass_balance_results < 1e-5]) / len(mass_balance_results) return prediction_loss, mass_balance_ratio if __name__ == "__main__": study = optuna.create_study(storage="sqlite:///model_optimization.db", study_name="model_optimization", directions=["minimize", "maximize"]) study.optimize(lambda trial: objective(trial, preprocess, X_train, y_train, X_val, y_val, X_test, y_test), n_trials=1000) print("Number of finished trials: ", len(study.trials)) print("Best trial:") trial = study.best_trial print(" Value: ", trial.value) print(" Params: ") for key, value in trial.params.items(): print(" {}: {}".format(key, value))