Compare commits

...

45 Commits

| Author | SHA1 | Message | Date |
|--------|------|---------|------|
| Hannes Signer | 4fe35855f0 | Merge branch 'main' of git.gfz-potsdam.de:naaice/model-training | 2025-02-27 12:05:31 +01:00 |
| Hannes Signer | 91fd34bce0 | Merge branch 'loss-experiment' | 2025-02-27 12:04:13 +01:00 |
| Hannes Martin Signer | 1d9d193668 | update julia script | 2025-02-27 11:44:51 +01:00 |
| Hannes Martin Signer | 5fee700eae | Merge branch 'log-scale-error' into 'loss-experiment': Solve scaling error with min max scaling and restructure notebook as well as preprocessing functions (see merge request naaice/model-training!3) | 2025-02-27 11:42:05 +01:00 |
| Hannes Signer | 471038de50 | update linting | 2025-02-26 18:51:40 +01:00 |
| Hannes Signer | 5f4c863b39 | first draft for students | 2025-02-26 18:32:22 +01:00 |
| Hannes Signer | 331306c141 | update mass_balance_evaluation | 2025-02-26 18:32:11 +01:00 |
| Hannes Signer | cce2f696a0 | document functions | 2025-02-26 12:57:39 +01:00 |
| Hannes Signer | 03918222c4 | test with losses | 2025-02-25 18:36:11 +01:00 |
| Hannes Signer | 0f7ee78a8a | correction of scaling error in mass balance loss | 2025-02-25 18:06:56 +01:00 |
| Hannes Signer | 8051eb3c3d | investigate scaling error | 2025-02-21 16:57:55 +01:00 |
| Hannes Signer | c0ca00271e | adapt notebook | 2025-02-21 09:42:33 +01:00 |
| Hannes Signer | 5ffcd75adc | update test samples | 2025-02-19 17:58:47 +01:00 |
| Hannes Signer | df7cabfb31 | update notebook to new training structure | 2025-02-18 20:47:24 +01:00 |
| Hannes Signer | 1381e9a879 | add first models | 2025-02-18 20:28:43 +01:00 |
| Hannes Signer | 7642d22d4d | add 4 corner small dataset | 2025-02-18 20:08:13 +01:00 |
| Hannes Signer | 3bf4ff42a0 | add dataset | 2025-02-18 20:06:23 +01:00 |
| Hannes Signer | bf0dfcb690 | delete cache | 2025-02-18 19:55:13 +01:00 |
| Hannes Signer | c447a6c84a | Merge branch 'loss-experiment' of git.gfz-potsdam.de:naaice/model-training into loss-experiment | 2025-02-18 17:16:16 +01:00 |
| Hannes Signer | 1a3787c530 | add measurement plan | 2025-02-18 17:08:50 +01:00 |
| Hannes Signer | b58e8869ad | prepare optimization study | 2025-02-18 17:05:24 +01:00 |
| Hannes Signer | 052cf356c9 | restructure project | 2025-02-18 14:41:41 +01:00 |
| Hannes Signer | 019a426af4 | update gitignore | 2025-02-17 22:23:48 +01:00 |
| Hannes Signer | 4a86161cce | test combined loss functions | 2025-02-17 22:23:28 +01:00 |
| Hannes Signer | 986191f72c | update training | 2025-02-17 18:13:57 +01:00 |
| Hannes Signer | d27e9e9c5d | add optimizer framework | 2025-02-17 00:11:36 +01:00 |
| Hannes Signer | 3171dd3643 | custom loss function | 2025-02-14 16:33:32 +01:00 |
| Hannes Signer | e9f49308f2 | further tests | 2025-02-13 17:11:45 +01:00 |
| Hannes Signer | 4d7502c309 | Merge branch 'loss-experiment' of git.gfz-potsdam.de:naaice/model-training into loss-experiment | 2025-02-11 17:22:13 +01:00 |
| Hannes Signer | a0145576f4 | add new model architecture | 2025-02-11 17:17:43 +01:00 |
| Hannes Signer | 3eb46f6c26 | further experiments | 2025-01-24 16:28:56 +01:00 |
| Hannes Signer | 67b1f23fa3 | update preprocessing | 2025-01-23 17:51:26 +01:00 |
| Hannes Signer | 78a57654a3 | encapsulate functionality | 2025-01-22 17:36:06 +01:00 |
| Hannes Signer | a4c419c618 | add template class | 2025-01-21 17:36:41 +01:00 |
| Hannes Signer | 81d35796f8 | add custom scaler steps | 2025-01-20 19:40:57 +01:00 |
| Hannes Signer | f0a4ddddc6 | add new dataset | 2025-01-20 10:25:27 +01:00 |
| Hannes Signer | d517ca6540 | add sampling function | 2025-01-15 17:23:58 +01:00 |
| Hannes Signer | bea094be35 | update sampling methods | 2025-01-15 16:04:14 +01:00 |
| Hannes Signer | 6872dc1aed | add normal oversampling | 2025-01-15 14:11:13 +01:00 |
| Hannes Signer | 78cd2f2083 | add data conversion script | 2025-01-15 12:02:54 +01:00 |
| Hannes Signer | 66fd10c19e | add SMORT sampling | 2025-01-15 12:01:22 +01:00 |
| Hannes Signer | bb94da79d7 | add data conversion script | 2025-01-15 12:00:30 +01:00 |
| Hannes Signer | 4ee967f613 | add SMOT sampling strategy | 2025-01-15 11:50:05 +01:00 |
| Hannes Signer | 60f0165df1 | add SMOT sampling strategy | 2025-01-15 11:49:01 +01:00 |
| Hannes Signer | f89228de06 | add kmeans clustering | 2025-01-14 17:48:41 +01:00 |
16 changed files with 1934 additions and 625 deletions

1
.gitignore vendored Normal file

@@ -0,0 +1 @@
__pycache__/preprocessing.cpython-311.pyc

BIN
Barite_50_Data.h5 (Stored with Git LFS)

Binary file not shown.

BIN
Barite_50_Data_inference.h5 (Stored with Git LFS)

Binary file not shown.

File diff suppressed because one or more lines are too long

60
convert_data.jl Normal file

@@ -0,0 +1,60 @@
using HDF5
using RData
using DataFrames
# Load Training Data
# train_data = load("Barite_50_Data.rds")
# training_h5_name = "Barite_50_Data.h5"
# h5open(training_h5_name, "w") do fid
# for key in keys(train_data)
# group = create_group(fid, key)
# group["names"] = names(train_data[key])
# group["data", compress=3] = Matrix(train_data[key])
# # group = create_group(fid, key)
# # grou["names"] = coln
# end
# end
# List all .rds files starting with "iter" in a given directory
rds_files = filter(x -> startswith(x, "iter"), readdir("barite_out/"))
# remove "iter_0.rds" from the list
rds_files = rds_files[2:end]
big_df_in = DataFrame()
big_df_out = DataFrame()
for rds_file in rds_files
# Load the RDS file
data = load("barite_out/$rds_file")
# Convert the data to a DataFrame
df_T = DataFrame(data["T"])
df_C = DataFrame(data["C"])
# Append the DataFrame to the big DataFrame
append!(big_df_in, df_T)
append!(big_df_out, df_C)
end
# remove ID, Barite_p1, Celestite_p1 columns
big_df_in = big_df_in[:, Not([:ID, :Barite_p1, :Celestite_p1])]
big_df_out = big_df_out[:, Not([:ID, :Barite_p1, :Celestite_p1])]
inference_h5_name = "Barite_50_Data_inference.h5"
h5open(inference_h5_name, "w") do fid
fid["names"] = names(big_df_in)
fid["data", compress=9] = Matrix(big_df_in)
end
training_h5_name = "Barite_50_Data_training.h5"
h5open(training_h5_name, "w") do fid
group_in = create_group(fid, "design")
group_out = create_group(fid, "result")
group_in["names"] = names(big_df_in)
group_in["data", compress=9] = Matrix(big_df_in)
group_out["names"] = names(big_df_out)
group_out["data", compress=9] = Matrix(big_df_out)
end
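
As a quick orientation on the output layout (not part of the change set): the training file written at the end of this script contains `design` and `result` groups, each with a `names` and a `data` dataset, which is the layout that `src/optuna_runs.py` later reads. A minimal Python sketch, assuming the `Barite_50_Data_training.h5` file produced above:

```python
# Sketch: read the HDF5 layout written by convert_data.jl back into pandas.
# The transpose mirrors src/optuna_runs.py: HDF5.jl stores Julia matrices column-major,
# so h5py sees the dimensions reversed.
import h5py
import numpy as np
import pandas as pd

with h5py.File("Barite_50_Data_training.h5", "r") as f:
    design = pd.DataFrame(np.array(f["design"]["data"]).transpose(),
                          columns=np.array(f["design"]["names"].asstr()))
    result = pd.DataFrame(np.array(f["result"]["data"]).transpose(),
                          columns=np.array(f["result"]["names"].asstr()))
print(design.shape, result.shape)
```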

2
datasets/.gitattributes vendored Normal file

@@ -0,0 +1,2 @@
Barite_50_Data_training.h5 filter=lfs diff=lfs merge=lfs -text
{file_name} filter=lfs diff=lfs merge=lfs -text

BIN
datasets/barite_50_4_corner.h5 (Stored with Git LFS) Normal file

Binary file not shown.

24
doc/measurement_plan.md Normal file

@@ -0,0 +1,24 @@
# Measurement overview for model optimization
### Parameters to optimize
### Saved models
`./results/model_large_standardization.keras`: Trained on the `barite_50_4_corner.h5` dataset with the extended loss function (Huber loss with mass balance) and **standardized data**
### Experiments
| **Experiment** | **Dataset** | **Model** | **Loss function** | **Activation** | **Preprocessing** |
|----------------|-----------------------|-----------|-------------------|----------------|-------------------|
| 1 | barite_50_4_corner.h5 | large | Huber+dBa+dSr | LeakyReLU | Standardization |
| 2 | barite_50_4_corner.h5 | | | | |
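
The saved model listed above was trained with a custom loss, so loading it back for inference is easiest without re-compiling. A minimal sketch, assuming inference only (the custom Huber+mass-balance loss and metrics are then not needed):

```python
# Sketch: load the saved Keras model for inference only (compile=False skips the custom loss).
import keras
import numpy as np

model = keras.models.load_model("./results/model_large_standardization.keras", compile=False)
# The architectures in src/preprocessing.py take 8 input features (H, O, Ba, Cl, S, Sr, Barite, Celestite)
# and predict the same 8 species; inputs must be standardized the same way as during training.
dummy = np.zeros((1, 8), dtype="float32")
print(model.predict(dummy).shape)  # expected: (1, 8)
```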

BIN
model_large.keras Normal file

Binary file not shown.

BIN
results/models/model_large_minmax.keras (Stored with Git LFS) Normal file

Binary file not shown.

BIN
results/models/model_large_standardization.keras (Stored with Git LFS) Normal file

Binary file not shown.

952
src/POET_Training.ipynb Normal file

File diff suppressed because one or more lines are too long

88
src/convert_data.jl Normal file

@@ -0,0 +1,88 @@
#!/usr/bin/env julia
# qs_read = []
# # find all the files in 'barite_out'
# files = readdir("barite_out"; join=true)
# # remove files which do not have the extension '.qs2' and contains 'iter'
# files = filter(x -> occursin(r".*\.qs2", x) && occursin(r"iter", x), files)
# # remove first entry as it is iteration 0
# files = files[2:end]
# test1 = qs_read(files[1])
# @rput test1
# R"test1 <- test1$C"
# @rget test1
# check if ARGS contains 2 elements
if length(ARGS) != 2
println("Usage: julia convert.jl <directory> <output_file_name>.h5")
exit(1)
end
to_read_dir = ARGS[1]
output_file_name = ARGS[2] * ".h5"
# check if the directory exists
if !isdir(to_read_dir)
println("The directory \"$to_read_dir\" does not exist")
exit(1)
end
using HDF5, RCall, DataFrames
@rlibrary qs2
# List all .rds files starting with "iter" in a given directory
qs_files = filter(x -> occursin(r".*\.qs2", x) && occursin(r"iter", x), readdir(to_read_dir; join=true))[2:end]
df_design = DataFrame()
df_result = DataFrame()
for file in qs_files
# Load the RDS file
data = qs_read(file)
# get basename of the file
basename = split(file, "/")[end]
# get the iteration number by splitting the basename and parse the second element
iteration = parse(Int, split(split(basename, "_")[2], ".")[1])
@rput data
R"transport <- data$T"
R"chemistry <- data$C"
@rget transport
@rget chemistry
# Add iteration number to the DataFrame
transport.iteration = fill(iteration, size(transport, 1))
chemistry.iteration = fill(iteration, size(chemistry, 1))
# Append the DataFrame to the big DataFrame
append!(df_design, transport)
append!(df_result, chemistry)
end
# remove ID, Barite_p1, Celestite_p1 columns
df_design = df_design[:, Not([:ID, :Barite_p1, :Celestite_p1])]
df_result = df_result[:, Not([:ID, :Barite_p1, :Celestite_p1])]
h5open(output_file_name, "w") do fid
group_in = create_group(fid, "design")
group_out = create_group(fid, "result")
group_in["names"] = names(df_design)
group_in["data", compress=9] = Matrix(df_design)
group_out["names"] = names(df_result)
group_out["data", compress=9] = Matrix(df_result)
end
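
For reference, this converter is invoked as `julia src/convert_data.jl <directory> <output_file_name>` and appends the `.h5` extension itself; unlike the root-level `convert_data.jl`, it also keeps the chemistry iteration number in an extra `iteration` column. A minimal Python check, assuming a hypothetical run that produced `barite_50.h5`:

```python
# Sketch: verify that the converter wrote design/result groups and kept the added "iteration" column.
import h5py
import numpy as np

with h5py.File("barite_50.h5", "r") as f:  # hypothetical output name
    for group in ("design", "result"):
        names = list(np.array(f[group]["names"].asstr()))
        assert "iteration" in names, f"'iteration' column missing in {group}"
        print(group, f[group]["data"].shape)
```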

138
src/optuna_runs.py Normal file

@@ -0,0 +1,138 @@
import keras
from keras.layers import Dense, Dropout, Input,BatchNormalization
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from preprocessing import *
from sklearn import set_config
from importlib import reload
set_config(transform_output = "pandas")
import optuna
import pickle
def objective(trial, X, y, species_columns):
model_type = trial.suggest_categorical("model", ["small", "large", "paper"])
scaler_type = trial.suggest_categorical("scaler", ["standard", "minmax"])
sampling_type = trial.suggest_categorical("sampling", ["over", "off"])
loss_variant = trial.suggest_categorical("loss", ["huber", "huber_mass_balance"])
delta = trial.suggest_float("delta", 0.5, 5.0)
preprocess = preprocessing()
    X, y = preprocess.cluster(X[species_columns], y[species_columns])
X_train, X_test, y_train, y_test = preprocess.split(X, y, ratio = 0.2)
X_train, y_train = preprocess.balancer(X_train, y_train, strategy = sampling_type)
preprocess.scale_fit(X_train, y_train, scaling = "global", type=scaler_type)
X_train, X_test, y_train, y_test = preprocess.scale_transform(X_train, X_test, y_train, y_test)
X_train, X_val, y_train, y_val = preprocess.split(X_train, y_train, ratio = 0.1)
column_dict = {"Ba": X.columns.get_loc("Ba"), "Barite": X.columns.get_loc("Barite"), "Sr": X.columns.get_loc(
"Sr"), "Celestite": X.columns.get_loc("Celestite"), "H": X.columns.get_loc("H"), "H": X.columns.get_loc("H"), "O": X.columns.get_loc("O")}
h1 = trial.suggest_float("h1", 0.1, 1.0)
h2 = trial.suggest_float("h2", 0.1, 1.0)
h3 = trial.suggest_float("h3", 0.1, 1.0)
model = model_definition(model_type)
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=0.001,
decay_steps=2000,
decay_rate=0.9,
staircase=True
)
optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(optimizer=optimizer, loss=custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type, loss_variant, delta),
metrics=[huber_metric(preprocess, scaler_type, delta), mass_balance_metric(preprocess, column_dict, scaler_type)])
callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
history = model.fit(X_train.loc[:, X_train.columns != "Class"],
y_train.loc[:, y_train.columns != "Class"],
batch_size=512,
epochs=100,
validation_data=(X_val.loc[:, X_val.columns != "Class"], y_val.loc[:, y_val.columns != "Class"]),
callbacks=[callback])
prediction_huber_overall = model.evaluate(
X_test.loc[:, X_test.columns != "Class"], y_test.loc[:, y_test.columns != "Class"])[1]
prediction_huber_non_reactive = model.evaluate(
X_test[X_test['Class'] == 0].iloc[:, X_test.columns != "Class"], y_test[X_test['Class'] == 0].iloc[:, y_test.columns != "Class"])[1]
prediction_huber_reactive = model.evaluate(
X_test[X_test['Class'] == 1].iloc[:, X_test.columns != "Class"], y_test[X_test['Class'] == 1].iloc[:, y_test.columns != "Class"])[1]
prediction_mass_balance_overall = model.evaluate(
X_test.loc[:, X_test.columns != "Class"], y_test.loc[:, y_test.columns != "Class"])[2]
prediction_mass_balance_non_reactive = model.evaluate(
X_test[X_test['Class'] == 0].iloc[:, X_test.columns != "Class"], y_test[X_test['Class'] == 0].iloc[:, y_test.columns != "Class"])[2]
prediction_mass_balance_reactive = model.evaluate(
X_test[X_test['Class'] == 1].iloc[:, X_test.columns != "Class"], y_test[X_test['Class'] == 1].iloc[:, y_test.columns != "Class"])[2]
mass_balance_results = mass_balance_evaluation(model, X_test, preprocess)
    mass_balance_ratio = len(
        mass_balance_results[mass_balance_results["mass_balance"] < 1e-5]) / len(mass_balance_results)
results_save_path = os.path.join("./results/", "results.csv")
results_df = pd.DataFrame({
"trial": [trial.number],
"prediction_huber_overall": [prediction_huber_overall],
"prediction_huber_non_reactive": [prediction_huber_non_reactive],
"prediction_huber_reactive": [prediction_huber_reactive],
"prediction_mass_balance_overall": [prediction_mass_balance_overall],
"prediction_mass_balance_non_reactive": [prediction_mass_balance_non_reactive],
"prediction_mass_balance_reactive": [prediction_mass_balance_reactive]
})
if not os.path.isfile(results_save_path):
results_df.to_csv(results_save_path, index=False)
else:
results_df.to_csv(results_save_path, mode='a', header=False, index=False)
model_save_path_trial = os.path.join("./results/models/", f"model_trial_{trial.number}.keras")
history_save_path_trial = os.path.join("./results/history/", f"history_trial_{trial.number}.pkl")
model.save(model_save_path_trial)
with open(history_save_path_trial, 'wb') as f:
pickle.dump(history.history, f)
return prediction_huber_overall, mass_balance_ratio
if __name__ == "__main__":
print(os.path.abspath("./datasets/barite_50_4_corner.h5"))
data_file = h5py.File("./datasets/barite_50_4_corner.h5")
design = data_file["design"]
results = data_file["result"]
df_design = pd.DataFrame(np.array(design["data"]).transpose(), columns = np.array(design["names"].asstr()))
df_results = pd.DataFrame(np.array(results["data"]).transpose(), columns = np.array(results["names"].asstr()))
data_file.close()
species_columns = ['H', 'O', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']
study = optuna.create_study(storage="sqlite:///model_large_optimization.db", study_name="model_optimization", directions=["minimize", "maximize"])
study.optimize(lambda trial: objective(trial, df_design, df_results, species_columns), n_trials=1000)
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
print(" {}: {}".format(key, value))

660
src/preprocessing.py Normal file

@@ -0,0 +1,660 @@
import keras
from keras.layers import Dense, Dropout, Input, BatchNormalization, LeakyReLU
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from sklearn import set_config
from importlib import reload
set_config(transform_output="pandas")
def model_definition(architecture):
"""Definition of the respective AI model. Three models are currently being analysed, which are labelled small, large or paper.
Args:
architecture (String): Choose between 'small', 'large' or 'paper'.
Returns:
keras model: Returns the respective model.
"""
dtype = "float32"
if architecture == "small":
model = keras.Sequential(
[
keras.Input(shape=(8,), dtype=dtype),
keras.layers.Dense(units=128, dtype=dtype),
LeakyReLU(negative_slope=0.01),
# Dropout(0.2),
keras.layers.Dense(units=128, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(units=8, dtype=dtype),
]
)
elif architecture == "large":
model = keras.Sequential(
[
keras.layers.Input(shape=(8,), dtype=dtype),
keras.layers.Dense(512, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(1024, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(512, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(8, dtype=dtype),
]
)
elif architecture == "paper":
model = keras.Sequential(
[
keras.layers.Input(shape=(8,), dtype=dtype),
keras.layers.Dense(128, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(256, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(512, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(256, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(8, dtype=dtype),
]
)
else:
raise Exception(
"No valid architecture found."
+ "Choose between 'small', 'large' or 'paper'."
)
return model
@keras.saving.register_keras_serializable()
def custom_loss(
preprocess,
column_dict,
h1,
h2,
h3,
scaler_type="minmax",
loss_variant="huber",
delta=1.0,
):
"""
Custom tensorflow loss function to combine Huber Loss with mass balance.
This is inspired by PINN (Physics Informed Neural Networks) where the loss function is a combination of the physics-based loss and the data-driven loss.
The mass balance is a physics-based loss that ensures the conservation of mass in the system.
A tensorflow loss function accepts only the two arguments y_true and y_pred. Therefore, a nested function is used to pass the additional arguments.
Args:
preprocess: preprocessing object
column_dict: dictionary with the column names as keys and the corresponding index as values.
(i.e {'H': 0, 'O': 1, 'Ba': 2, 'Cl': 3, 'S': 4, 'Sr': 5, 'Barite': 6, 'Celestite': 7})
h1: hyperparameter for the importance of the huber loss
h2: hyperparameter for the importance of the Barium mass balance term
h3: hyperparameter for the importance of the Strontium mass balance term
scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".
        loss_variant: Loss function approach. Choose between "huber" and "huber_mass_balance". Defaults to "huber".
delta: Hyperparameter for the Huber function threshold. Defaults to 1.0.
Returns:
loss function
"""
# as far as I know tensorflow does not directly support the use of scaler objects
# therefore, the backtransformation is done manually
try:
if scaler_type == "minmax":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.data_range_, dtype=tf.float32
)
min_X = tf.convert_to_tensor(
preprocess.scaler_X.data_min_, dtype=tf.float32
)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.data_range_, dtype=tf.float32
)
min_y = tf.convert_to_tensor(
preprocess.scaler_y.data_min_, dtype=tf.float32
)
elif scaler_type == "standard":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.scale_, dtype=tf.float32)
mean_X = tf.convert_to_tensor(
preprocess.scaler_X.mean_, dtype=tf.float32)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.scale_, dtype=tf.float32)
mean_y = tf.convert_to_tensor(
preprocess.scaler_y.mean_, dtype=tf.float32)
else:
raise Exception(
"No valid scaler type found. Choose between 'standard' and 'minmax'."
)
except AttributeError:
raise Exception(
"Data normalized with scaler different than specified for the training. Compare the scaling approach on preprocessing and training."
)
def loss(results, predicted):
# inverse min/max scaling
if scaler_type == "minmax":
predicted_inverse = predicted * scale_y + min_y
results_inverse = results * scale_X + min_X
# inverse standard scaling
elif scaler_type == "standard":
predicted_inverse = predicted * scale_y + mean_y
results_inverse = results * scale_X + mean_X
        # apply expm1 to the columns of predicted_inverse and results_inverse if a log1p transformation was used
if preprocess.func_dict_out is not None:
predicted_inverse = tf.math.expm1(predicted_inverse)
results_inverse = tf.math.expm1(results_inverse)
# mass balance
# in total no Barium and Strontium should be lost in one simulation step
dBa = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["Ba"]]
+ predicted_inverse[:, column_dict["Barite"]]
)
- (
results_inverse[:, column_dict["Ba"]]
+ results_inverse[:, column_dict["Barite"]]
)
)
dSr = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["Sr"]]
+ predicted_inverse[:, column_dict["Celestite"]]
)
- (
results_inverse[:, column_dict["Sr"]]
+ results_inverse[:, column_dict["Celestite"]]
)
)
# huber loss
huber_loss = tf.keras.losses.Huber(delta)(results, predicted)
# total loss
if loss_variant == "huber":
total_loss = huber_loss
elif loss_variant == "huber_mass_balance":
total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr
else:
raise Exception(
"No valid loss variant found. Choose between 'huber' and 'huber_mass_balance'."
)
return total_loss
return loss
def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
"""Auxilary function to calculate the mass balance during training.
Args:
preprocess: preprocessing object
column_dict: dictionary with the column names as keys and the corresponding index as values
scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".
Returns:
mean of both mass balance terms
"""
if scaler_type == "minmax":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.data_range_, dtype=tf.float32
)
min_X = tf.convert_to_tensor(
preprocess.scaler_X.data_min_, dtype=tf.float32)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.data_range_, dtype=tf.float32
)
min_y = tf.convert_to_tensor(
preprocess.scaler_y.data_min_, dtype=tf.float32)
elif scaler_type == "standard":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.scale_, dtype=tf.float32)
mean_X = tf.convert_to_tensor(
preprocess.scaler_X.mean_, dtype=tf.float32)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.scale_, dtype=tf.float32)
mean_y = tf.convert_to_tensor(
preprocess.scaler_y.mean_, dtype=tf.float32)
def mass_balance(results, predicted):
# inverse min/max scaling
if scaler_type == "minmax":
predicted_inverse = predicted * scale_y + min_y
results_inverse = results * scale_X + min_X
elif scaler_type == "standard":
predicted_inverse = predicted * scale_y + mean_y
results_inverse = results * scale_X + mean_X
if preprocess.func_dict_out is not None:
predicted_inverse = tf.math.expm1(predicted_inverse)
results_inverse = tf.math.expm1(results_inverse)
# mass balance
dBa = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["Ba"]]
+ predicted_inverse[:, column_dict["Barite"]]
)
- (
results_inverse[:, column_dict["Ba"]]
+ results_inverse[:, column_dict["Barite"]]
)
)
dSr = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["Sr"]]
+ predicted_inverse[:, column_dict["Celestite"]]
)
- (
results_inverse[:, column_dict["Sr"]]
+ results_inverse[:, column_dict["Celestite"]]
)
)
return tf.reduce_mean(dBa + dSr)
return mass_balance
def huber_metric(preprocess=None, scaler_type="minmax", delta=1.0):
    """Auxiliary function to calculate the Huber loss during training.
    Args:
        preprocess: preprocessing object (accepted for a uniform metric-factory interface, unused here)
        scaler_type (str, optional): Normalization approach. Defaults to "minmax". Unused here, since the Huber metric is computed on the scaled values.
        delta (float, optional): Hyperparameter for the Huber function threshold. Defaults to 1.0.
    """
def huber(results, predicted):
huber_loss = tf.keras.losses.Huber(delta)(results, predicted)
return huber_loss
return huber
def mass_balance_evaluation(model, X, preprocess):
"""Calculates the mass balance difference for each cell.
Args:
model: trained model
X: data where the mass balance should be calculated
preprocess: preprocessing object
Returns:
vector with the mass balance difference for each cell
"""
# predict the chemistry
columns = X.iloc[:, X.columns != "Class"].columns
classes = X["Class"]
classes.reset_index(drop=True, inplace=True)
prediction = pd.DataFrame(model.predict(X[columns]), columns=columns)
# backtransform min/max or standard scaler
X = pd.DataFrame(
preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]),
columns=columns,
)
prediction = pd.DataFrame(
preprocess.scaler_y.inverse_transform(prediction), columns=columns
)
# apply backtransformation if log transformation was applied
if preprocess.func_dict_out is not None:
X = preprocess.funcInverse(X)[0]
prediction = preprocess.funcInverse(prediction)[0]
# calculate mass balance
dBa = np.abs(
(prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"]))
dSr = np.abs(
(prediction["Sr"] + prediction["Celestite"]) -
(X["Sr"] + X["Celestite"])
)
mass_balance_result = pd.DataFrame(
{"dBa": dBa, "dSr": dSr, "mass_balance": dBa + dSr, "Class": classes}
)
return mass_balance_result
def mass_balance_ratio(results, threshold=1e-5):
proportion = {}
mass_balance_threshold = results[results["mass_balance"] <= threshold]
overall = len(mass_balance_threshold)
class_0_amount = len(
mass_balance_threshold[mass_balance_threshold["Class"] == 0])
class_1_amount = len(
mass_balance_threshold[mass_balance_threshold["Class"] == 1])
proportion["overall"] = overall / len(results)
proportion["class_0"] = class_0_amount / \
len(results[results["Class"] == 0])
proportion["class_1"] = class_1_amount / \
len(results[results["Class"] == 1])
return proportion
class preprocessing:
"""
A class used to preprocess data for model training.
Attributes
"""
def __init__(self, func_dict_in=None, func_dict_out=None, random_state=42):
"""Initialization of the preprocessing object.
Args:
func_dict_in: function for transformation. Defaults to None.
func_dict_out: function for backtransformation. Defaults to None.
            random_state (int, optional): Seed for reproducibility. Defaults to 42.
"""
self.random_state = random_state
self.scaler_X = None
self.scaler_y = None
        self.func_dict_in = func_dict_in
        self.func_dict_out = func_dict_out
self.state = {"cluster": False, "log": False,
"balance": False, "scale": False}
def funcTranform(self, *args):
"""Apply the transformation function to the data columnwise.
Returns:
pandas data frame: transformed data
"""
for i in args:
for key in i.keys():
if "Class" not in key:
i[key] = i[key].apply(self.func_dict_in)
self.state["log"] = True
return args
def funcInverse(self, *args):
"""Apply the backtransformation function to the data columnwise.
Returns:
pandas data frame: backtransformed data
"""
for i in args:
for key in i.keys():
if "Class" not in key:
i[key] = i[key].apply(self.func_dict_out)
self.state["log"] = False
return args
def cluster(self, X, y, species="Barite", n_clusters=2, x_length=50, y_length=50):
"""Apply k-means clustering to the data to differentiate betweeen reactive and non-reactive cells.
Args:
X: design data set
y: target data set
            species (str, optional): Chemical species to which clustering is applied. Defaults to "Barite".
n_clusters (int, optional): Number of clusters. Defaults to 2.
x_length: x dimension of the grid. Defaults to 50.
y_length: y dimension of the grid. Defaults to 50.
Returns:
X, y dataframes with an additional column "Class" containing the cluster labels.
"""
class_labels = np.array([])
grid_length = x_length * y_length
iterations = int(len(X) / grid_length)
# calculate the cluster for each chemical iteration step
for i in range(0, iterations):
field = np.array(
X[species][(i * grid_length): (i * grid_length + grid_length)]
).reshape(x_length, y_length)
kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state).fit(
field.reshape(-1, 1)
)
class_labels = np.append(class_labels.astype(int), kmeans.labels_)
if "Class" in X.columns and "Class" in y.columns:
print("Class column already exists")
else:
class_labels_df = pd.DataFrame(class_labels, columns=["Class"])
X = pd.concat([X, class_labels_df], axis=1)
y = pd.concat([y, class_labels_df], axis=1)
self.state["cluster"] = True
return X, y
def balancer(self, X, y, strategy, sample_fraction=0.5):
"""Apply sampling strategies to balance the dataset.
Args:
X: design dataset (before the simulation)
y: target dataset (after the simulation)
strategy: Sampling strategy. Choose between "smote" (Synthetic Minority Oversampling Technique), "over" (Oversampling) and "under" (Undersampling).
sample_fraction (float, optional): Define balancer target. Specifies the target fraction of the minority class after the balancing step. Defaults to 0.5.
Returns:
X, y: resampled datasets
"""
number_features = (X.columns != "Class").sum()
if "Class" not in X.columns:
if "Class" in y.columns:
classes = y["Class"]
else:
raise Exception("No class column found")
else:
classes = X["Class"]
counter = classes.value_counts()
print("Amount class 0 before:",
counter[0] / (counter[0] + counter[1]))
print("Amount class 1 before:",
counter[1] / (counter[0] + counter[1]))
df = pd.concat(
[
X.loc[:, X.columns != "Class"],
y.loc[:, y.columns != "Class"],
classes,
],
axis=1,
)
if strategy == "smote":
print("Using SMOTE strategy")
smote = SMOTE(sampling_strategy=sample_fraction)
df_resampled, classes_resampled = smote.fit_resample(
df.loc[:, df.columns != "Class"], df.loc[:,
df.columns == "Class"]
)
elif strategy == "over":
print("Using Oversampling")
over = RandomOverSampler()
df_resampled, classes_resampled = over.fit_resample(
df.loc[:, df.columns != "Class"], df.loc[:,
df.columns == "Class"]
)
elif strategy == "under":
print("Using Undersampling")
under = RandomUnderSampler()
df_resampled, classes_resampled = under.fit_resample(
df.loc[:, df.columns != "Class"], df.loc[:,
df.columns == "Class"]
)
else:
print("No sampling selected. Output equals input.")
return X, y
counter = classes_resampled["Class"].value_counts()
print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]))
print("Amount class 1 after:", counter[1] / (counter[0] + counter[1]))
design_resampled = pd.concat(
[df_resampled.iloc[:, 0:number_features], classes_resampled], axis=1
)
target_resampled = pd.concat(
[df_resampled.iloc[:, number_features:], classes_resampled], axis=1
)
self.state["balance"] = True
return design_resampled, target_resampled
    def scale_fit(self, X, y, scaling, type="standard"):
"""Fit a scaler for data preprocessing.
Args:
X: design dataset
y: target dataset
            scaling: "individual" fits separate scalers for X and y; "global" fits a single scaler on the combined data of X and y (scaler_X and scaler_y are then identical).
            type (str, optional): "minmax" for MinMax scaling or "standard" for standardization. Defaults to "standard".
"""
if type == "minmax":
self.scaler_X = MinMaxScaler()
self.scaler_y = MinMaxScaler()
elif type == "standard":
self.scaler_X = StandardScaler()
self.scaler_y = StandardScaler()
else:
raise Exception("No valid scaler type found")
if scaling == "individual":
self.scaler_X.fit(X.iloc[:, X.columns != "Class"])
self.scaler_y.fit(y.iloc[:, y.columns != "Class"])
elif scaling == "global":
self.scaler_X.fit(
pd.concat(
[X.iloc[:, X.columns != "Class"],
y.iloc[:, y.columns != "Class"]],
axis=0,
)
)
self.scaler_y = self.scaler_X
self.state["scale"] = True
def scale_transform(self, X_train, X_test, y_train, y_test):
"""Apply learned scaler on datasets.
Args:
            X_train: design training data
            X_test: design test data
            y_train: target training data
            y_test: target test data
Returns:
transformed dataframes
"""
X_train = pd.concat(
[
self.scaler_X.transform(
X_train.loc[:, X_train.columns != "Class"]),
X_train.loc[:, "Class"],
],
axis=1,
)
X_test = pd.concat(
[
self.scaler_X.transform(
X_test.loc[:, X_test.columns != "Class"]),
X_test.loc[:, "Class"],
],
axis=1,
)
y_train = pd.concat(
[
self.scaler_y.transform(
y_train.loc[:, y_train.columns != "Class"]),
y_train.loc[:, "Class"],
],
axis=1,
)
y_test = pd.concat(
[
self.scaler_y.transform(
y_test.loc[:, y_test.columns != "Class"]),
y_test.loc[:, "Class"],
],
axis=1,
)
return X_train, X_test, y_train, y_test
def scale_inverse(self, *args):
"""Backtransform the dataset
Returns:
Backtransformed data frames
"""
result = []
for i in args:
if "Class" in i.columns:
inversed = pd.DataFrame(
self.scaler_X.inverse_transform(
i.loc[:, i.columns != "Class"]),
columns=i.columns[:-1],
)
class_column = i.loc[:, "Class"].reset_index(drop=True)
i = pd.concat([inversed, class_column], axis=1)
else:
i = pd.DataFrame(
self.scaler_X.inverse_transform(i), columns=i.columns)
result.append(i)
return result
    def split(self, X, y, ratio=0.8):
        """Split X and y into train and test sets, where ratio is the fraction used for the test set."""
        X_train, X_test, y_train, y_test = sk.train_test_split(
            X, y, test_size=ratio, random_state=self.random_state
        )
        return X_train, X_test, y_train, y_test
    def class_selection(self, *args, class_label=0):
        """Select only rows with a specific class label.
        Args:
            Dataframes from which rows with the given class label should be selected. Defaults to 0.
        Returns:
            List of dataframes restricted to the selected class label.
        """
        return [i[i["Class"] == class_label] for i in args]
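
Taken together, the class is meant to be used in the order cluster, split, balance, scale, as in `src/optuna_runs.py`. A minimal usage sketch, assuming `df_design` and `df_results` have already been loaded from the HDF5 dataset as in that script:

```python
# Sketch: typical preprocessing flow with the preprocessing class (mirrors src/optuna_runs.py).
from preprocessing import preprocessing

# df_design, df_results: loaded from the HDF5 dataset as in src/optuna_runs.py (see above)
species_columns = ['H', 'O', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']
prep = preprocessing()

# 1) label reactive vs. non-reactive cells via k-means on the Barite field
X, y = prep.cluster(df_design[species_columns], df_results[species_columns])
# 2) hold out a test set, then balance only the training data
X_train, X_test, y_train, y_test = prep.split(X, y, ratio=0.2)
X_train, y_train = prep.balancer(X_train, y_train, strategy="over")
# 3) fit one global min/max scaler on the training data and apply it to all splits
prep.scale_fit(X_train, y_train, scaling="global", type="minmax")
X_train, X_test, y_train, y_test = prep.scale_transform(X_train, X_test, y_train, y_test)
```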