mirror of
https://git.gfz-potsdam.de/naaice/model-training.git
synced 2025-12-15 19:38:21 +01:00
225 lines
7.8 KiB
Python
225 lines
7.8 KiB
Python
import keras
|
|
from keras.layers import Dense, Dropout, Input,BatchNormalization
|
|
import tensorflow as tf
|
|
import h5py
|
|
import numpy as np
|
|
import pandas as pd
|
|
import time
|
|
import sklearn.model_selection as sk
|
|
import matplotlib.pyplot as plt
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.pipeline import Pipeline, make_pipeline
|
|
from sklearn.preprocessing import StandardScaler, MinMaxScaler
|
|
from imblearn.over_sampling import SMOTE
|
|
from imblearn.under_sampling import RandomUnderSampler
|
|
from imblearn.over_sampling import RandomOverSampler
|
|
from collections import Counter
|
|
import os
|
|
from preprocessing import *
|
|
from sklearn import set_config
|
|
from importlib import reload
|
|
set_config(transform_output = "pandas")
|
|
|
|
dtype = "float32"
|
|
activation = "relu"
|
|
|
|
lr = 0.001
|
|
batch_size = 512
|
|
epochs = 50 # default 400 epochs
|
|
|
|
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
|
|
initial_learning_rate=lr,
|
|
decay_steps=2000,
|
|
decay_rate=0.9,
|
|
staircase=True
|
|
)
|
|
|
|
optimizer_simple = keras.optimizers.Adam(learning_rate=lr_schedule)
|
|
optimizer_large = keras.optimizers.Adam(learning_rate=lr_schedule)
|
|
optimizer_paper = keras.optimizers.Adam(learning_rate=lr_schedule)
|
|
|
|
sample_fraction = 0.8
|
|
|
|
# small model
|
|
model_simple = keras.Sequential(
|
|
[
|
|
keras.Input(shape = (9,), dtype = "float32"),
|
|
keras.layers.Dense(units = 128, activation = "linear", dtype = "float32"),
|
|
# Dropout(0.2),
|
|
keras.layers.Dense(units = 128, activation = "elu", dtype = "float32"),
|
|
keras.layers.Dense(units = 9, dtype = "float32")
|
|
]
|
|
)
|
|
|
|
def Safelog(val):
|
|
# get range of vector
|
|
if val > 0:
|
|
return np.log10(val)
|
|
elif val < 0:
|
|
return -np.log10(-val)
|
|
else:
|
|
return 0
|
|
|
|
def Safeexp(val):
|
|
if val > 0:
|
|
return -10 ** -val
|
|
elif val < 0:
|
|
return 10 ** val
|
|
else:
|
|
return 0
|
|
|
|
# ? Why does the charge is using another logarithm than the other species
|
|
|
|
func_dict_in = {
|
|
"H" : np.log1p,
|
|
"O" : np.log1p,
|
|
"Charge" : Safelog,
|
|
"H_0_" : np.log1p,
|
|
"O_0_" : np.log1p,
|
|
"Ba" : np.log1p,
|
|
"Cl" : np.log1p,
|
|
"S_2_" : np.log1p,
|
|
"S_6_" : np.log1p,
|
|
"Sr" : np.log1p,
|
|
"Barite" : np.log1p,
|
|
"Celestite" : np.log1p,
|
|
}
|
|
|
|
func_dict_out = {
|
|
"H" : np.expm1,
|
|
"O" : np.expm1,
|
|
"Charge" : Safeexp,
|
|
"H_0_" : np.expm1,
|
|
"O_0_" : np.expm1,
|
|
"Ba" : np.expm1,
|
|
"Cl" : np.expm1,
|
|
"S_2_" : np.expm1,
|
|
"S_6_" : np.expm1,
|
|
"Sr" : np.expm1,
|
|
"Barite" : np.expm1,
|
|
"Celestite" : np.expm1,
|
|
}
|
|
|
|
# os.chdir('/mnt/beegfs/home/signer/projects/model-training')
|
|
data_file = h5py.File("barite_50_4_corner.h5")
|
|
|
|
design = data_file["design"]
|
|
results = data_file["result"]
|
|
|
|
df_design = pd.DataFrame(np.array(design["data"]).transpose(), columns = np.array(design["names"].asstr()))
|
|
df_results = pd.DataFrame(np.array(results["data"]).transpose(), columns = np.array(results["names"].asstr()))
|
|
|
|
data_file.close()
|
|
|
|
species_columns = ['H', 'O', 'Charge', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']
|
|
|
|
preprocess = preprocessing(func_dict_in=func_dict_in, func_dict_out=func_dict_out)
|
|
X, y = preprocess.cluster(df_design[species_columns], df_results[species_columns])
|
|
# X, y = preprocess.funcTranform(X, y)
|
|
|
|
X_train, X_test, y_train, y_test = preprocess.split(X, y, ratio = 0.2)
|
|
X_train, y_train = preprocess.balancer(X_train, y_train, strategy = "over")
|
|
preprocess.scale_fit(X_train, y_train, scaling = "individual")
|
|
X_train, X_test, y_train, y_test = preprocess.scale_transform(X_train, X_test, y_train, y_test)
|
|
X_train, X_val, y_train, y_val = preprocess.split(X_train, y_train, ratio = 0.1)
|
|
|
|
column_dict = {"Ba": X.columns.get_loc("Ba"), "Barite":X.columns.get_loc("Barite"), "Sr":X.columns.get_loc("Sr"), "Celestite":X.columns.get_loc("Celestite"), "H":X.columns.get_loc("H"), "H":X.columns.get_loc("H"), "O":X.columns.get_loc("O")}
|
|
|
|
def custom_loss(preprocess, column_dict, h1, h2, h3, h4):
|
|
# extract the scaling parameters
|
|
scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
|
|
min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
|
|
scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
|
|
min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)
|
|
|
|
def loss(results, predicted):
|
|
# inverse min/max scaling
|
|
predicted_inverse = predicted * scale_X + min_X
|
|
results_inverse = results * scale_y + min_y
|
|
|
|
# mass balance
|
|
dBa = tf.keras.backend.abs(
|
|
(predicted_inverse[:, column_dict["Ba"]] + predicted_inverse[:, column_dict["Barite"]]) -
|
|
(results_inverse[:, column_dict["Ba"]] + results_inverse[:, column_dict["Barite"]])
|
|
)
|
|
dSr = tf.keras.backend.abs(
|
|
(predicted_inverse[:, column_dict["Sr"]] + predicted_inverse[:, column_dict["Celestite"]]) -
|
|
(results_inverse[:, column_dict["Sr"]] + results_inverse[:, column_dict["Celestite"]])
|
|
)
|
|
|
|
# H/O ratio has to be 2
|
|
h2o_ratio = tf.keras.backend.abs(
|
|
(predicted_inverse[:, column_dict["H"]] / predicted_inverse[:, column_dict["O"]]) - 2
|
|
)
|
|
|
|
# huber loss
|
|
huber_loss = tf.keras.losses.Huber()(results, predicted)
|
|
|
|
# total loss
|
|
total_loss = h1 * huber_loss + h2 * dBa**2 + h3 * dSr**2 #+ h4 * h2o_ratio**2
|
|
|
|
return total_loss
|
|
|
|
return loss
|
|
|
|
def mass_balance(model, X, preprocess):
|
|
|
|
# predict the chemistry
|
|
columns = X.iloc[:, X.columns != "Class"].columns
|
|
prediction = pd.DataFrame(model.predict(X[columns]), columns=columns)
|
|
|
|
# backtransform min/max
|
|
X = pd.DataFrame(preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]), columns=columns)
|
|
prediction = pd.DataFrame(preprocess.scaler_y.inverse_transform(prediction), columns=columns)
|
|
|
|
# calculate mass balance dBa = np.abs((prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"]))
|
|
dSr = np.abs((prediction["Sr"] + prediction["Celestite"]) - (X["Sr"] + X["Celestite"]))
|
|
|
|
return dBa + dSr
|
|
|
|
import optuna
|
|
|
|
def create_model(model, preprocess, h1, h2, h3, h4):
|
|
|
|
model.compile(optimizer=optimizer_simple, loss=custom_loss(preprocess, column_dict, h1, h2, h3, h4))
|
|
|
|
return model
|
|
|
|
|
|
def objective(trial, preprocess, X_train, y_train, X_val, y_val, X_test, y_test):
|
|
h1 = trial.suggest_float("h1", 0.1, 10)
|
|
h2 = trial.suggest_float("h2", 0.1, 10)
|
|
h3 = trial.suggest_float("h3", 0.1, 10)
|
|
h4 = trial.suggest_float("h4", 0.1, 10)
|
|
|
|
model = create_model(model_simple, preprocess, h1, h2, h3, h4)
|
|
|
|
callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
|
|
history = model.fit(X_train.loc[:, X_train.columns != "Class"],
|
|
y_train.loc[:, y_train.columns != "Class"],
|
|
batch_size=batch_size,
|
|
epochs=50,
|
|
validation_data=(X_val.loc[:, X_val.columns != "Class"], y_val.loc[:, y_val.columns != "Class"]),
|
|
callbacks=[callback])
|
|
|
|
prediction_loss = model.evaluate(X_test.loc[:, X_test.columns != "Class"], y_test.loc[:, y_test.columns != "Class"])
|
|
mass_balance_results = mass_balance(model, X_test, preprocess)
|
|
|
|
mass_balance_ratio = len(mass_balance_results[mass_balance_results < 1e-5]) / len(mass_balance_results)
|
|
|
|
return prediction_loss, mass_balance_ratio
|
|
|
|
if __name__ == "__main__":
|
|
study = optuna.create_study(storage="sqlite:///model_optimization.db", study_name="model_optimization", directions=["minimize", "maximize"])
|
|
study.optimize(lambda trial: objective(trial, preprocess, X_train, y_train, X_val, y_val, X_test, y_test), n_trials=1000)
|
|
|
|
print("Number of finished trials: ", len(study.trials))
|
|
|
|
print("Best trial:")
|
|
trial = study.best_trial
|
|
|
|
print(" Value: ", trial.value)
|
|
|
|
print(" Params: ")
|
|
for key, value in trial.params.items():
|
|
print(" {}: {}".format(key, value)) |