prepare optimization study

Hannes Signer 2025-02-18 17:05:24 +01:00
parent 019a426af4
commit b58e8869ad
4 changed files with 3788 additions and 0 deletions

3206
src/POET_Training.ipynb Normal file

File diff suppressed because one or more lines are too long

60
src/convert_data.jl Normal file

@@ -0,0 +1,60 @@
using HDF5
using RData
using DataFrames
# Load training data (previous single-file export, kept for reference)
# train_data = load("Barite_50_Data.rds")
# training_h5_name = "Barite_50_Data.h5"
# h5open(training_h5_name, "w") do fid
#     for key in keys(train_data)
#         group = create_group(fid, key)
#         group["names"] = names(train_data[key])
#         group["data", compress=3] = Matrix(train_data[key])
#     end
# end
# List all .rds files starting with "iter" in a given directory
rds_files = filter(x -> startswith(x, "iter"), readdir("barite_out/"))
# remove "iter_0.rds" from the list
rds_files = rds_files[2:end]
big_df_in = DataFrame()
big_df_out = DataFrame()
for rds_file in rds_files
    # Load the RDS file
    data = load("barite_out/$rds_file")
    # Convert the input (T) and output (C) tables to DataFrames
    df_T = DataFrame(data["T"])
    df_C = DataFrame(data["C"])
    # Append them to the combined DataFrames
    append!(big_df_in, df_T)
    append!(big_df_out, df_C)
end
# remove ID, Barite_p1, Celestite_p1 columns
big_df_in = big_df_in[:, Not([:ID, :Barite_p1, :Celestite_p1])]
big_df_out = big_df_out[:, Not([:ID, :Barite_p1, :Celestite_p1])]
inference_h5_name = "Barite_50_Data_inference.h5"
h5open(inference_h5_name, "w") do fid
    fid["names"] = names(big_df_in)
    fid["data", compress=9] = Matrix(big_df_in)
end
training_h5_name = "Barite_50_Data_training.h5"
h5open(training_h5_name, "w") do fid
    group_in = create_group(fid, "design")
    group_out = create_group(fid, "result")
    group_in["names"] = names(big_df_in)
    group_in["data", compress=9] = Matrix(big_df_in)
    group_out["names"] = names(big_df_out)
    group_out["data", compress=9] = Matrix(big_df_out)
end
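The file written here is consumed from Python (see src/optuna_runs.py, which reads a different dataset with the same design/result layout). A minimal sketch, assuming the script above has been run in the current directory, to check that the export round-trips as expected; the transpose is needed because HDF5.jl writes Julia's column-major arrays, which h5py reads with reversed dimensions:

import h5py
import numpy as np
import pandas as pd

# read the exported training file back and rebuild the two DataFrames
with h5py.File("Barite_50_Data_training.h5", "r") as f:
    df_design = pd.DataFrame(np.array(f["design"]["data"]).transpose(),
                             columns=np.array(f["design"]["names"].asstr()))
    df_result = pd.DataFrame(np.array(f["result"]["data"]).transpose(),
                             columns=np.array(f["result"]["names"].asstr()))
print(df_design.shape, df_result.shape)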

138
src/optuna_runs.py Normal file

@@ -0,0 +1,138 @@
import keras
from keras.layers import Dense, Dropout, Input,BatchNormalization
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from preprocessing import *
from sklearn import set_config
from importlib import reload
set_config(transform_output = "pandas")
import optuna
import pickle
def objective(trial, X, y, species_columns):
    model_type = trial.suggest_categorical("model", ["small", "large", "paper"])
    scaler_type = trial.suggest_categorical("scaler", ["standard", "minmax"])
    sampling_type = trial.suggest_categorical("sampling", ["over", "off"])
    loss_variant = trial.suggest_categorical("loss", ["huber", "huber_mass_balance"])
    delta = trial.suggest_float("delta", 0.5, 5.0)
    preprocess = preprocessing()
    # use the data passed in instead of the module-level globals
    X, y = preprocess.cluster(X[species_columns], y[species_columns])
    X_train, X_test, y_train, y_test = preprocess.split(X, y, ratio=0.2)
    X_train, y_train = preprocess.balancer(X_train, y_train, strategy=sampling_type)
    preprocess.scale_fit(X_train, y_train, scaling="global", type=scaler_type)
    X_train, X_test, y_train, y_test = preprocess.scale_transform(X_train, X_test, y_train, y_test)
    X_train, X_val, y_train, y_val = preprocess.split(X_train, y_train, ratio=0.1)
    column_dict = {"Ba": X.columns.get_loc("Ba"), "Barite": X.columns.get_loc("Barite"),
                   "Sr": X.columns.get_loc("Sr"), "Celestite": X.columns.get_loc("Celestite"),
                   "H": X.columns.get_loc("H"), "O": X.columns.get_loc("O")}
    h1 = trial.suggest_float("h1", 0.1, 1.0)
    h2 = trial.suggest_float("h2", 0.1, 1.0)
    h3 = trial.suggest_float("h3", 0.1, 1.0)
    model = model_definition(model_type)
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.001,
        decay_steps=2000,
        decay_rate=0.9,
        staircase=True
    )
    optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)
    model.compile(optimizer=optimizer,
                  loss=custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type, loss_variant, delta),
                  metrics=[huber_metric(preprocess, scaler_type, delta),
                           mass_balance_metric(preprocess, column_dict, scaler_type)])
    callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
    history = model.fit(X_train.loc[:, X_train.columns != "Class"],
                        y_train.loc[:, y_train.columns != "Class"],
                        batch_size=512,
                        epochs=100,
                        validation_data=(X_val.loc[:, X_val.columns != "Class"],
                                         y_val.loc[:, y_val.columns != "Class"]),
                        callbacks=[callback])
    # evaluate loss and both metrics on the full test set and on the
    # non-reactive / reactive subsets (one evaluate call per subset)
    feature_cols = X_test.columns != "Class"
    target_cols = y_test.columns != "Class"
    non_reactive = X_test['Class'] == 0
    reactive = X_test['Class'] == 1
    _, prediction_huber_overall, prediction_mass_balance_overall = model.evaluate(
        X_test.loc[:, feature_cols], y_test.loc[:, target_cols])
    _, prediction_huber_non_reactive, prediction_mass_balance_non_reactive = model.evaluate(
        X_test[non_reactive].loc[:, feature_cols], y_test[non_reactive].loc[:, target_cols])
    _, prediction_huber_reactive, prediction_mass_balance_reactive = model.evaluate(
        X_test[reactive].loc[:, feature_cols], y_test[reactive].loc[:, target_cols])
    mass_balance_results = mass_balance_evaluation(model, X_test, preprocess)
    mass_balance_ratio = len(mass_balance_results[mass_balance_results < 1e-5]) / len(mass_balance_results)
    # append this trial's metrics to a running CSV (create output folders if missing)
    os.makedirs("./results/models/", exist_ok=True)
    os.makedirs("./results/history/", exist_ok=True)
    results_save_path = os.path.join("./results/", "results.csv")
    results_df = pd.DataFrame({
        "trial": [trial.number],
        "prediction_huber_overall": [prediction_huber_overall],
        "prediction_huber_non_reactive": [prediction_huber_non_reactive],
        "prediction_huber_reactive": [prediction_huber_reactive],
        "prediction_mass_balance_overall": [prediction_mass_balance_overall],
        "prediction_mass_balance_non_reactive": [prediction_mass_balance_non_reactive],
        "prediction_mass_balance_reactive": [prediction_mass_balance_reactive]
    })
    if not os.path.isfile(results_save_path):
        results_df.to_csv(results_save_path, index=False)
    else:
        results_df.to_csv(results_save_path, mode='a', header=False, index=False)
    # persist the trained model and its training history for this trial
    model_save_path_trial = os.path.join("./results/models/", f"model_trial_{trial.number}.keras")
    history_save_path_trial = os.path.join("./results/history/", f"history_trial_{trial.number}.pkl")
    model.save(model_save_path_trial)
    with open(history_save_path_trial, 'wb') as f:
        pickle.dump(history.history, f)
    return prediction_huber_overall, mass_balance_ratio
if __name__ == "__main__":
    print(os.path.abspath("./datasets/barite_50_4_corner.h5"))
    data_file = h5py.File("./datasets/barite_50_4_corner.h5", "r")
    design = data_file["design"]
    results = data_file["result"]
    df_design = pd.DataFrame(np.array(design["data"]).transpose(), columns=np.array(design["names"].asstr()))
    df_results = pd.DataFrame(np.array(results["data"]).transpose(), columns=np.array(results["names"].asstr()))
    data_file.close()
    species_columns = ['H', 'O', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']
    study = optuna.create_study(storage="sqlite:///model_large_optimization.db", study_name="model_optimization",
                                directions=["minimize", "maximize"])
    study.optimize(lambda trial: objective(trial, df_design, df_results, species_columns), n_trials=1000)
    print("Number of finished trials: ", len(study.trials))
    # with two objectives there is no single best trial; report the Pareto front instead
    print("Best trials (Pareto front):")
    for trial in study.best_trials:
        print("  Values: ", trial.values)
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))

384
src/preprocessing.py Normal file

@@ -0,0 +1,384 @@
import keras
from keras.layers import Dense, Dropout, Input,BatchNormalization, LeakyReLU
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from sklearn import set_config
from importlib import reload
set_config(transform_output = "pandas")
# preprocessing pipeline
#
def Safelog(val):
    # signed log10: positive values map to log10(val), negative values to -log10(-val)
    if val > 0:
        return np.log10(val)
    elif val < 0:
        return -np.log10(-val)
    else:
        return 0

def Safeexp(val):
    # inverse of Safelog for original values with magnitude below 1;
    # outside that range the pair does not round-trip
    if val > 0:
        return -10 ** -val
    elif val < 0:
        return 10 ** val
    else:
        return 0
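# Illustrative wiring (not used in this file): the preprocessing class below accepts
# per-column function dictionaries, so a typical setup for these helpers would be
#   func_dict_in  = {col: Safelog for col in species_columns}
#   func_dict_out = {col: Safeexp for col in species_columns}
# where species_columns is the column list from src/optuna_runs.py; funcTranform /
# funcInverse then apply the functions element-wise to every non-"Class" column.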
def model_definition(architecture):
    dtype = "float32"
    if architecture == "small":
        model = keras.Sequential([
            keras.layers.Input(shape=(8,), dtype=dtype),
            keras.layers.Dense(128, dtype=dtype),
            LeakyReLU(negative_slope=0.01),
            # Dropout(0.2),
            keras.layers.Dense(128, dtype=dtype),
            LeakyReLU(negative_slope=0.01),
            keras.layers.Dense(8, dtype=dtype)
        ])
    elif architecture == "large":
        model = keras.Sequential([
            keras.layers.Input(shape=(8,), dtype=dtype),
            keras.layers.Dense(512, dtype=dtype),
            LeakyReLU(negative_slope=0.01),
            keras.layers.Dense(1024, dtype=dtype),
            LeakyReLU(negative_slope=0.01),
            keras.layers.Dense(512, dtype=dtype),
            LeakyReLU(negative_slope=0.01),
            keras.layers.Dense(8, dtype=dtype)
        ])
    elif architecture == "paper":
        model = keras.Sequential([
            keras.layers.Input(shape=(8,), dtype=dtype),
            keras.layers.Dense(128, dtype=dtype),
            LeakyReLU(negative_slope=0.01),
            keras.layers.Dense(256, dtype=dtype),
            LeakyReLU(negative_slope=0.01),
            keras.layers.Dense(512, dtype=dtype),
            LeakyReLU(negative_slope=0.01),
            keras.layers.Dense(256, dtype=dtype),
            LeakyReLU(negative_slope=0.01),
            keras.layers.Dense(8, dtype=dtype)
        ])
    else:
        raise ValueError(f"unknown architecture: {architecture}")
    return model
def custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type="minmax", loss_variant="huber", delta=1.0):
    # extract the scaling parameters
    if scaler_type == "minmax":
        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
        min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
        min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)
    elif scaler_type == "standard":
        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
        mean_X = tf.convert_to_tensor(preprocess.scaler_X.mean_, dtype=tf.float32)
        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
        mean_y = tf.convert_to_tensor(preprocess.scaler_y.mean_, dtype=tf.float32)

    def loss(results, predicted):
        # invert the scaling (MinMaxScaler: X = (X_scaled - min_) / scale_;
        # StandardScaler: X = X_scaled * scale_ + mean_). The true values are
        # inverted with the X scaler; under scaling="global" scaler_y is scaler_X.
        if scaler_type == "minmax":
            predicted_inverse = (predicted - min_y) / scale_y
            results_inverse = (results - min_X) / scale_X
        elif scaler_type == "standard":
            predicted_inverse = predicted * scale_y + mean_y
            results_inverse = results * scale_X + mean_X
        # mass balance: total Ba and Sr (aqueous + mineral) must be conserved
        dBa = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["Ba"]] + predicted_inverse[:, column_dict["Barite"]]) -
            (results_inverse[:, column_dict["Ba"]] + results_inverse[:, column_dict["Barite"]])
        )
        dSr = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["Sr"]] + predicted_inverse[:, column_dict["Celestite"]]) -
            (results_inverse[:, column_dict["Sr"]] + results_inverse[:, column_dict["Celestite"]])
        )
        # H/O ratio has to be 2
        # h2o_ratio = tf.keras.backend.abs(
        #     (predicted_inverse[:, column_dict["H"]] / predicted_inverse[:, column_dict["O"]]) - 2
        # )
        # Huber loss on the scaled values
        huber_loss = tf.keras.losses.Huber(delta)(results, predicted)
        # total loss
        if loss_variant == "huber":
            total_loss = huber_loss
        elif loss_variant == "huber_mass_balance":
            total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr
        return total_loss

    return loss
def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
    if scaler_type == "minmax":
        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
        min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
        min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)
    elif scaler_type == "standard":
        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
        mean_X = tf.convert_to_tensor(preprocess.scaler_X.mean_, dtype=tf.float32)
        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
        mean_y = tf.convert_to_tensor(preprocess.scaler_y.mean_, dtype=tf.float32)

    def mass_balance(results, predicted):
        # invert the scaling (same formulas as in custom_loss)
        if scaler_type == "minmax":
            predicted_inverse = (predicted - min_y) / scale_y
            results_inverse = (results - min_X) / scale_X
        elif scaler_type == "standard":
            predicted_inverse = predicted * scale_y + mean_y
            results_inverse = results * scale_X + mean_X
        # mass balance error for Ba and Sr
        dBa = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["Ba"]] + predicted_inverse[:, column_dict["Barite"]]) -
            (results_inverse[:, column_dict["Ba"]] + results_inverse[:, column_dict["Barite"]])
        )
        dSr = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["Sr"]] + predicted_inverse[:, column_dict["Celestite"]]) -
            (results_inverse[:, column_dict["Sr"]] + results_inverse[:, column_dict["Celestite"]])
        )
        return tf.reduce_mean(dBa + dSr)

    return mass_balance
def huber_metric(preprocess, scaler_type="minmax", delta=1.0):
    # the Huber metric is computed directly on the scaled values, so no scaler
    # parameters are needed here; preprocess and scaler_type are kept only to
    # match the signature of the other metric factories
    def huber(results, predicted):
        return tf.keras.losses.Huber(delta)(results, predicted)

    return huber
def mass_balance_evaluation(model, X, preprocess):
    # predict the chemistry
    columns = X.columns[X.columns != "Class"]
    prediction = pd.DataFrame(model.predict(X[columns]), columns=columns)
    # back-transform the min/max or standard scaling
    X = pd.DataFrame(preprocess.scaler_X.inverse_transform(X.loc[:, columns]), columns=columns)
    prediction = pd.DataFrame(preprocess.scaler_y.inverse_transform(prediction), columns=columns)
    # calculate the mass balance error for Ba and Sr (prints are debug output)
    dBa = np.abs((prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"]))
    print(dBa.min())
    dSr = np.abs((prediction["Sr"] + prediction["Celestite"]) - (X["Sr"] + X["Celestite"]))
    print(dSr.min())
    return dBa + dSr
class preprocessing:
    def __init__(self, func_dict_in=None, func_dict_out=None, random_state=42):
        self.random_state = random_state
        self.scaler_X = None
        self.scaler_y = None
        self.func_dict_in = func_dict_in
        self.func_dict_out = func_dict_out
        self.state = {"cluster": False, "log": False, "balance": False, "scale": False}

    def funcTranform(self, X, y):
        # apply the element-wise forward transforms (e.g. Safelog) to all non-class columns
        for key in X.keys():
            if "Class" not in key:
                X[key] = X[key].apply(self.func_dict_in[key])
                y[key] = y[key].apply(self.func_dict_in[key])
        self.state["log"] = True
        return X, y

    def funcInverse(self, X, y):
        # apply the element-wise inverse transforms (e.g. Safeexp) to all non-class columns
        for key in X.keys():
            if "Class" not in key:
                X[key] = X[key].apply(self.func_dict_out[key])
                y[key] = y[key].apply(self.func_dict_out[key])
        self.state["log"] = False
        return X, y
    def cluster(self, X, y, species='Barite', n_clusters=2, x_length=50, y_length=50):
        # assign a class label to every grid cell by clustering the mineral field of each
        # snapshot with KMeans (used downstream to separate reactive / non-reactive cells)
        class_labels = np.array([])
        grid_length = x_length * y_length
        iterations = int(len(X) / grid_length)
        for i in range(0, iterations):
            field = np.array(X[species][(i*grid_length):(i*grid_length+grid_length)]).reshape(x_length, y_length)
            kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state).fit(field.reshape(-1, 1))
            class_labels = np.append(class_labels.astype(int), kmeans.labels_)
        if ("Class" in X.columns and "Class" in y.columns):
            print("Class column already exists")
        else:
            class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
            X = pd.concat([X, class_labels_df], axis=1)
            y = pd.concat([y, class_labels_df], axis=1)
        self.state["cluster"] = True
        return X, y
    def balancer(self, X, y, strategy, sample_fraction=0.5):
        number_features = (X.columns != "Class").sum()
        if "Class" not in X.columns:
            if "Class" in y.columns:
                classes = y['Class']
            else:
                raise Exception("No class column found")
        else:
            classes = X['Class']
        counter = classes.value_counts()
        print("Amount class 0 before:", counter[0] / (counter[0] + counter[1]))
        print("Amount class 1 before:", counter[1] / (counter[0] + counter[1]))
        df = pd.concat([X.loc[:, X.columns != "Class"], y.loc[:, y.columns != "Class"], classes], axis=1)
        if strategy == 'smote':
            print("Using SMOTE strategy")
            smote = SMOTE(sampling_strategy=sample_fraction)
            df_resampled, classes_resampled = smote.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])
        elif strategy == 'over':
            print("Using Oversampling")
            over = RandomOverSampler()
            df_resampled, classes_resampled = over.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])
        elif strategy == 'under':
            print("Using Undersampling")
            under = RandomUnderSampler()
            df_resampled, classes_resampled = under.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])
        else:
            return X, y
        counter = classes_resampled["Class"].value_counts()
        print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]))
        print("Amount class 1 after:", counter[1] / (counter[0] + counter[1]))
        design_resampled = pd.concat([df_resampled.iloc[:, 0:number_features], classes_resampled], axis=1)
        target_resampled = pd.concat([df_resampled.iloc[:, number_features:], classes_resampled], axis=1)
        self.state['balance'] = True
        return design_resampled, target_resampled
    def scale_fit(self, X, y, scaling, type='standard'):
        if type == 'minmax':
            self.scaler_X = MinMaxScaler()
            self.scaler_y = MinMaxScaler()
        elif type == 'standard':
            self.scaler_X = StandardScaler()
            self.scaler_y = StandardScaler()
        else:
            raise Exception("No valid scaler type found")
        if scaling == 'individual':
            # fit separate scalers for inputs and targets
            self.scaler_X.fit(X.iloc[:, X.columns != "Class"])
            self.scaler_y.fit(y.iloc[:, y.columns != "Class"])
        elif scaling == 'global':
            # fit one scaler on inputs and targets stacked together and share it
            self.scaler_X.fit(pd.concat([X.iloc[:, X.columns != "Class"], y.iloc[:, y.columns != "Class"]], axis=0))
            self.scaler_y = self.scaler_X
        self.state['scale'] = True
    def scale_transform(self, X_train, X_test, y_train, y_test):
        X_train = pd.concat([self.scaler_X.transform(X_train.loc[:, X_train.columns != "Class"]), X_train.loc[:, "Class"]], axis=1)
        X_test = pd.concat([self.scaler_X.transform(X_test.loc[:, X_test.columns != "Class"]), X_test.loc[:, "Class"]], axis=1)
        y_train = pd.concat([self.scaler_y.transform(y_train.loc[:, y_train.columns != "Class"]), y_train.loc[:, "Class"]], axis=1)
        y_test = pd.concat([self.scaler_y.transform(y_test.loc[:, y_test.columns != "Class"]), y_test.loc[:, "Class"]], axis=1)
        return X_train, X_test, y_train, y_test
    def scale_inverse(self, X):
        if "Class" in X.columns:
            feature_cols = X.columns[X.columns != "Class"]
            # inverse_transform returns a plain array, so wrap it again before concatenating
            inverted = pd.DataFrame(self.scaler_X.inverse_transform(X.loc[:, feature_cols]),
                                    columns=feature_cols, index=X.index)
            X = pd.concat([inverted, X.loc[:, "Class"]], axis=1)
        else:
            X = self.scaler_X.inverse_transform(X)
        return X
    def split(self, X, y, ratio=0.8):
        # ratio is passed through as the test_size fraction;
        # sklearn returns (X_train, X_test, y_train, y_test), so keep that order
        X_train, X_test, y_train, y_test = sk.train_test_split(X, y, test_size=ratio, random_state=self.random_state)
        return X_train, X_test, y_train, y_test
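A minimal usage sketch of the preprocessing class, following the call order in src/optuna_runs.py; the HDF5 path and the column list are taken from that script, and "over" / "minmax" are just example choices for the balancing and scaling options:

import h5py
import numpy as np
import pandas as pd
from preprocessing import preprocessing

species_columns = ['H', 'O', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']
with h5py.File("./datasets/barite_50_4_corner.h5", "r") as f:
    X = pd.DataFrame(np.array(f["design"]["data"]).transpose(), columns=np.array(f["design"]["names"].asstr()))
    y = pd.DataFrame(np.array(f["result"]["data"]).transpose(), columns=np.array(f["result"]["names"].asstr()))

prep = preprocessing()
X, y = prep.cluster(X[species_columns], y[species_columns])            # label reactive / non-reactive cells
X_train, X_test, y_train, y_test = prep.split(X, y, ratio=0.2)         # hold out a test set
X_train, y_train = prep.balancer(X_train, y_train, strategy="over")    # oversample the minority class
prep.scale_fit(X_train, y_train, scaling="global", type="minmax")      # fit one shared scaler
X_train, X_test, y_train, y_test = prep.scale_transform(X_train, X_test, y_train, y_test)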