Mirror of https://git.gfz-potsdam.de/naaice/model-training.git, synced 2025-12-13 12:28:21 +01:00

Compare commits: 5413be8610...4fe35855f0 (45 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 4fe35855f0 | |
| | 91fd34bce0 | |
| | 1d9d193668 | |
| | 5fee700eae | |
| | 471038de50 | |
| | 5f4c863b39 | |
| | 331306c141 | |
| | cce2f696a0 | |
| | 03918222c4 | |
| | 0f7ee78a8a | |
| | 8051eb3c3d | |
| | c0ca00271e | |
| | 5ffcd75adc | |
| | df7cabfb31 | |
| | 1381e9a879 | |
| | 7642d22d4d | |
| | 3bf4ff42a0 | |
| | bf0dfcb690 | |
| | c447a6c84a | |
| | 1a3787c530 | |
| | b58e8869ad | |
| | 052cf356c9 | |
| | 019a426af4 | |
| | 4a86161cce | |
| | 986191f72c | |
| | d27e9e9c5d | |
| | 3171dd3643 | |
| | e9f49308f2 | |
| | 4d7502c309 | |
| | a0145576f4 | |
| | 3eb46f6c26 | |
| | 67b1f23fa3 | |
| | 78a57654a3 | |
| | a4c419c618 | |
| | 81d35796f8 | |
| | f0a4ddddc6 | |
| | d517ca6540 | |
| | bea094be35 | |
| | 6872dc1aed | |
| | 78cd2f2083 | |
| | 66fd10c19e | |
| | bb94da79d7 | |
| | 4ee967f613 | |
| | 60f0165df1 | |
| | f89228de06 | |
.gitignore (vendored, Normal file, 1 line)

@@ -0,0 +1 @@
__pycache__/preprocessing.cpython-311.pyc
BIN Barite_50_Data.h5 (Stored with Git LFS): binary file not shown.
BIN Barite_50_Data_inference.h5 (Stored with Git LFS): binary file not shown.
File diff suppressed because one or more lines are too long
convert_data.jl (Normal file, 60 lines)

@@ -0,0 +1,60 @@
using HDF5
using RData
using DataFrames

# Load Training Data
# train_data = load("Barite_50_Data.rds")

# training_h5_name = "Barite_50_Data.h5"
# h5open(training_h5_name, "w") do fid
#     for key in keys(train_data)
#         group = create_group(fid, key)
#         group["names"] = names(train_data[key])
#         group["data", compress=3] = Matrix(train_data[key])
#         # group = create_group(fid, key)
#         # grou["names"] = coln
#     end
# end

# List all .rds files starting with "iter" in a given directory
rds_files = filter(x -> startswith(x, "iter"), readdir("barite_out/"))

# remove "iter_0.rds" from the list
rds_files = rds_files[2:end]

big_df_in = DataFrame()
big_df_out = DataFrame()

for rds_file in rds_files
    # Load the RDS file
    data = load("barite_out/$rds_file")
    # Convert the data to DataFrames
    df_T = DataFrame(data["T"])
    df_C = DataFrame(data["C"])
    # Append the DataFrames to the big DataFrames
    append!(big_df_in, df_T)
    append!(big_df_out, df_C)
end

# remove ID, Barite_p1, Celestite_p1 columns
big_df_in = big_df_in[:, Not([:ID, :Barite_p1, :Celestite_p1])]
big_df_out = big_df_out[:, Not([:ID, :Barite_p1, :Celestite_p1])]

inference_h5_name = "Barite_50_Data_inference.h5"
h5open(inference_h5_name, "w") do fid
    fid["names"] = names(big_df_in)
    fid["data", compress=9] = Matrix(big_df_in)
end

training_h5_name = "Barite_50_Data_training.h5"
h5open(training_h5_name, "w") do fid
    group_in = create_group(fid, "design")
    group_out = create_group(fid, "result")

    group_in["names"] = names(big_df_in)
    group_in["data", compress=9] = Matrix(big_df_in)

    group_out["names"] = names(big_df_out)
    group_out["data", compress=9] = Matrix(big_df_out)
end
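The groups written above can be read back on the Python side; a minimal sketch, assuming `h5py` is installed and mirroring how `src/optuna_runs.py` loads the training file:

```python
import h5py
import numpy as np
import pandas as pd

with h5py.File("Barite_50_Data_training.h5", "r") as f:
    names = np.array(f["design"]["names"].asstr())
    # HDF5.jl stores the Julia matrix column-major, so transpose to samples-by-columns
    data = np.array(f["design"]["data"]).transpose()

df_design = pd.DataFrame(data, columns=names)
print(df_design.head())
```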
datasets/.gitattributes (vendored, Normal file, 2 lines)

@@ -0,0 +1,2 @@
Barite_50_Data_training.h5 filter=lfs diff=lfs merge=lfs -text
{file_name} filter=lfs diff=lfs merge=lfs -text
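These are the attribute lines that `git lfs track <pattern>` appends: matching paths are committed as small LFS pointer files, which is why the `.h5` datasets and `.keras` models in this comparison are rendered as "Stored with Git LFS" rather than as diffs.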
BIN datasets/barite_50_4_corner.h5 (Stored with Git LFS, Normal file): binary file not shown.
doc/measurement_plan.md (Normal file, 24 lines)

@@ -0,0 +1,24 @@
# Measurement overview for model optimization

### Parameters to optimize

### Saved models

`./results/model_large_standardization.keras`: trained on the `barite_50_4_corner.h5` dataset with the extended loss function (Huber loss with mass balance) and **standardized data**

### Experiments

| **Experiment** | **Dataset** | **Model** | **Loss function** | **Activation** | **Preprocessing** |
|----------------|-----------------------|-----------|-------------------|----------------|-------------------|
| 1 | barite_50_4_corner.h5 | large | Huber+dBa+dSr | LeakyReLU | Standardization |
| 2 | barite_50_4_corner.h5 | | | | |
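For reference, the extended loss from `src/preprocessing.py` behind the "Huber+dBa+dSr" entry, written out per sample, with back-transformed predictions $\hat{c}$, back-transformed inputs $c$, and the Optuna-tuned weights $h_1, h_2, h_3$:

```latex
\mathcal{L} = h_1\,\mathrm{Huber}_{\delta}(y, \hat{y})
            + h_2\,\underbrace{\bigl|(\hat{c}_{\mathrm{Ba}} + \hat{c}_{\mathrm{Barite}}) - (c_{\mathrm{Ba}} + c_{\mathrm{Barite}})\bigr|}_{\mathrm{dBa}}
            + h_3\,\underbrace{\bigl|(\hat{c}_{\mathrm{Sr}} + \hat{c}_{\mathrm{Celestite}}) - (c_{\mathrm{Sr}} + c_{\mathrm{Celestite}})\bigr|}_{\mathrm{dSr}}
```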
BIN model_large.keras (Normal file): binary file not shown.
BIN results/models/model_large_minmax.keras (Stored with Git LFS, Normal file): binary file not shown.
BIN results/models/model_large_standardization.keras (Stored with Git LFS, Normal file): binary file not shown.
src/POET_Training.ipynb (Normal file, 952 lines): file diff suppressed because one or more lines are too long.
src/convert_data.jl (Normal file, 88 lines)

@@ -0,0 +1,88 @@
#!/usr/bin/env julia

# qs_read = []

# # find all the files in 'barite_out'
# files = readdir("barite_out"; join=true)

# # keep only files with the '.qs2' extension that contain 'iter'
# files = filter(x -> occursin(r".*\.qs2", x) && occursin(r"iter", x), files)

# # remove first entry as it is iteration 0
# files = files[2:end]

# test1 = qs_read(files[1])

# @rput test1

# R"test1 <- test1$C"

# @rget test1

# check if ARGS contains 2 elements
if length(ARGS) != 2
    println("Usage: julia convert.jl <directory> <output_file_name> (\".h5\" is appended automatically)")
    exit(1)
end

to_read_dir = ARGS[1]
output_file_name = ARGS[2] * ".h5"

# check if the directory exists
if !isdir(to_read_dir)
    println("The directory \"$to_read_dir\" does not exist")
    exit(1)
end

using HDF5, RCall, DataFrames
@rlibrary qs2

# List all .qs2 files containing "iter" in the given directory and drop iteration 0
qs_files = filter(x -> occursin(r".*\.qs2", x) && occursin(r"iter", x), readdir(to_read_dir; join=true))[2:end]

df_design = DataFrame()
df_result = DataFrame()

for file in qs_files
    # Load the qs2 file
    data = qs_read(file)

    # get the basename of the file
    basename = split(file, "/")[end]

    # get the iteration number by splitting the basename and parsing the second element
    iteration = parse(Int, split(split(basename, "_")[2], ".")[1])

    @rput data

    R"transport <- data$T"
    R"chemistry <- data$C"

    @rget transport
    @rget chemistry

    # Add the iteration number to the DataFrames
    transport.iteration = fill(iteration, size(transport, 1))
    chemistry.iteration = fill(iteration, size(chemistry, 1))

    # Append the DataFrames to the big DataFrames
    append!(df_design, transport)
    append!(df_result, chemistry)
end

# remove ID, Barite_p1, Celestite_p1 columns
df_design = df_design[:, Not([:ID, :Barite_p1, :Celestite_p1])]
df_result = df_result[:, Not([:ID, :Barite_p1, :Celestite_p1])]

h5open(output_file_name, "w") do fid
    group_in = create_group(fid, "design")
    group_out = create_group(fid, "result")

    group_in["names"] = names(df_design)
    group_in["data", compress=9] = Matrix(df_design)

    group_out["names"] = names(df_result)
    group_out["data", compress=9] = Matrix(df_result)
end
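A hypothetical invocation, assuming the `.qs2` iteration files sit in `barite_out/`: `julia src/convert_data.jl barite_out datasets/barite_50_4_corner` writes `datasets/barite_50_4_corner.h5` with the `design` and `result` groups that `src/optuna_runs.py` expects.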
src/optuna_runs.py (Normal file, 138 lines)

@@ -0,0 +1,138 @@
import keras
from keras.layers import Dense, Dropout, Input, BatchNormalization
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from preprocessing import *
from sklearn import set_config
from importlib import reload

set_config(transform_output="pandas")

import optuna
import pickle


def objective(trial, X, y, species_columns):

    model_type = trial.suggest_categorical("model", ["small", "large", "paper"])
    scaler_type = trial.suggest_categorical("scaler", ["standard", "minmax"])
    sampling_type = trial.suggest_categorical("sampling", ["over", "off"])
    loss_variant = trial.suggest_categorical("loss", ["huber", "huber_mass_balance"])
    delta = trial.suggest_float("delta", 0.5, 5.0)

    preprocess = preprocessing()
    # use the datasets passed as arguments instead of module-level globals
    X, y = preprocess.cluster(X[species_columns], y[species_columns])
    X_train, X_test, y_train, y_test = preprocess.split(X, y, ratio=0.2)
    X_train, y_train = preprocess.balancer(X_train, y_train, strategy=sampling_type)
    preprocess.scale_fit(X_train, y_train, scaling="global", type=scaler_type)
    X_train, X_test, y_train, y_test = preprocess.scale_transform(X_train, X_test, y_train, y_test)
    X_train, X_val, y_train, y_val = preprocess.split(X_train, y_train, ratio=0.1)

    column_dict = {"Ba": X.columns.get_loc("Ba"),
                   "Barite": X.columns.get_loc("Barite"),
                   "Sr": X.columns.get_loc("Sr"),
                   "Celestite": X.columns.get_loc("Celestite"),
                   "H": X.columns.get_loc("H"),
                   "O": X.columns.get_loc("O")}

    h1 = trial.suggest_float("h1", 0.1, 1.0)
    h2 = trial.suggest_float("h2", 0.1, 1.0)
    h3 = trial.suggest_float("h3", 0.1, 1.0)

    model = model_definition(model_type)

    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.001,
        decay_steps=2000,
        decay_rate=0.9,
        staircase=True
    )
    optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

    # huber_metric takes only the Huber delta (see preprocessing.py)
    model.compile(optimizer=optimizer,
                  loss=custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type, loss_variant, delta),
                  metrics=[huber_metric(delta), mass_balance_metric(preprocess, column_dict, scaler_type)])

    callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
    history = model.fit(X_train.loc[:, X_train.columns != "Class"],
                        y_train.loc[:, y_train.columns != "Class"],
                        batch_size=512,
                        epochs=100,
                        validation_data=(X_val.loc[:, X_val.columns != "Class"],
                                         y_val.loc[:, y_val.columns != "Class"]),
                        callbacks=[callback])

    prediction_huber_overall = model.evaluate(
        X_test.loc[:, X_test.columns != "Class"],
        y_test.loc[:, y_test.columns != "Class"])[1]
    prediction_huber_non_reactive = model.evaluate(
        X_test[X_test['Class'] == 0].iloc[:, X_test.columns != "Class"],
        y_test[X_test['Class'] == 0].iloc[:, y_test.columns != "Class"])[1]
    prediction_huber_reactive = model.evaluate(
        X_test[X_test['Class'] == 1].iloc[:, X_test.columns != "Class"],
        y_test[X_test['Class'] == 1].iloc[:, y_test.columns != "Class"])[1]
    prediction_mass_balance_overall = model.evaluate(
        X_test.loc[:, X_test.columns != "Class"],
        y_test.loc[:, y_test.columns != "Class"])[2]
    prediction_mass_balance_non_reactive = model.evaluate(
        X_test[X_test['Class'] == 0].iloc[:, X_test.columns != "Class"],
        y_test[X_test['Class'] == 0].iloc[:, y_test.columns != "Class"])[2]
    prediction_mass_balance_reactive = model.evaluate(
        X_test[X_test['Class'] == 1].iloc[:, X_test.columns != "Class"],
        y_test[X_test['Class'] == 1].iloc[:, y_test.columns != "Class"])[2]

    mass_balance_results = mass_balance_evaluation(model, X_test, preprocess)
    # fraction of cells whose total mass balance error stays below 1e-5
    mass_balance_ratio = len(
        mass_balance_results[mass_balance_results["mass_balance"] < 1e-5]) / len(mass_balance_results)

    # make sure the output directories exist
    os.makedirs("./results/models/", exist_ok=True)
    os.makedirs("./results/history/", exist_ok=True)

    results_save_path = os.path.join("./results/", "results.csv")
    results_df = pd.DataFrame({
        "trial": [trial.number],
        "prediction_huber_overall": [prediction_huber_overall],
        "prediction_huber_non_reactive": [prediction_huber_non_reactive],
        "prediction_huber_reactive": [prediction_huber_reactive],
        "prediction_mass_balance_overall": [prediction_mass_balance_overall],
        "prediction_mass_balance_non_reactive": [prediction_mass_balance_non_reactive],
        "prediction_mass_balance_reactive": [prediction_mass_balance_reactive]
    })

    if not os.path.isfile(results_save_path):
        results_df.to_csv(results_save_path, index=False)
    else:
        results_df.to_csv(results_save_path, mode='a', header=False, index=False)

    model_save_path_trial = os.path.join("./results/models/", f"model_trial_{trial.number}.keras")
    history_save_path_trial = os.path.join("./results/history/", f"history_trial_{trial.number}.pkl")

    model.save(model_save_path_trial)
    with open(history_save_path_trial, 'wb') as f:
        pickle.dump(history.history, f)

    return prediction_huber_overall, mass_balance_ratio


if __name__ == "__main__":

    print(os.path.abspath("./datasets/barite_50_4_corner.h5"))
    data_file = h5py.File("./datasets/barite_50_4_corner.h5", "r")
    design = data_file["design"]
    results = data_file["result"]

    df_design = pd.DataFrame(np.array(design["data"]).transpose(),
                             columns=np.array(design["names"].asstr()))
    df_results = pd.DataFrame(np.array(results["data"]).transpose(),
                              columns=np.array(results["names"].asstr()))

    data_file.close()

    species_columns = ['H', 'O', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']

    study = optuna.create_study(storage="sqlite:///model_large_optimization.db",
                                study_name="model_optimization",
                                directions=["minimize", "maximize"])
    study.optimize(lambda trial: objective(trial, df_design, df_results, species_columns), n_trials=1000)

    print("Number of finished trials: ", len(study.trials))

    # the study is multi-objective, so report the Pareto-optimal trials
    # (study.best_trial is undefined for multi-objective studies)
    print("Best trials:")
    for trial in study.best_trials:
        print("  Values: ", trial.values)
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))
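Since the study is persisted in SQLite, it can be reopened after (or during) the run; a minimal sketch, assuming `model_large_optimization.db` sits in the working directory:

```python
import optuna

# Reopen the persisted study by its name and storage URL (both set above).
study = optuna.load_study(study_name="model_optimization",
                          storage="sqlite:///model_large_optimization.db")

# With two objectives (minimize Huber, maximize the mass-balance ratio),
# best_trials holds the Pareto front instead of a single best trial.
for t in study.best_trials:
    print(t.number, t.values, t.params)
```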
src/preprocessing.py (Normal file, 660 lines)

@@ -0,0 +1,660 @@
import keras
from keras.layers import Dense, Dropout, Input, BatchNormalization, LeakyReLU
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from sklearn import set_config
from importlib import reload

set_config(transform_output="pandas")


def model_definition(architecture):
    """Definition of the respective AI model. Three models are currently being
    analysed, which are labelled 'small', 'large' or 'paper'.

    Args:
        architecture (String): Choose between 'small', 'large' or 'paper'.

    Returns:
        keras model: Returns the respective model.
    """
    dtype = "float32"

    if architecture == "small":
        model = keras.Sequential(
            [
                keras.Input(shape=(8,), dtype=dtype),
                keras.layers.Dense(units=128, dtype=dtype),
                LeakyReLU(negative_slope=0.01),
                # Dropout(0.2),
                keras.layers.Dense(units=128, dtype=dtype),
                LeakyReLU(negative_slope=0.01),
                keras.layers.Dense(units=8, dtype=dtype),
            ]
        )

    elif architecture == "large":
        model = keras.Sequential(
            [
                keras.layers.Input(shape=(8,), dtype=dtype),
                keras.layers.Dense(512, dtype=dtype),
                LeakyReLU(negative_slope=0.01),
                keras.layers.Dense(1024, dtype=dtype),
                LeakyReLU(negative_slope=0.01),
                keras.layers.Dense(512, dtype=dtype),
                LeakyReLU(negative_slope=0.01),
                keras.layers.Dense(8, dtype=dtype),
            ]
        )

    elif architecture == "paper":
        model = keras.Sequential(
            [
                keras.layers.Input(shape=(8,), dtype=dtype),
                keras.layers.Dense(128, dtype=dtype),
                LeakyReLU(negative_slope=0.01),
                keras.layers.Dense(256, dtype=dtype),
                LeakyReLU(negative_slope=0.01),
                keras.layers.Dense(512, dtype=dtype),
                LeakyReLU(negative_slope=0.01),
                keras.layers.Dense(256, dtype=dtype),
                LeakyReLU(negative_slope=0.01),
                keras.layers.Dense(8, dtype=dtype),
            ]
        )

    else:
        raise Exception(
            "No valid architecture found. "
            + "Choose between 'small', 'large' or 'paper'."
        )

    return model
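
# Illustrative usage (hypothetical; the real caller is src/optuna_runs.py):
#   model = model_definition("large")   # 8 inputs -> 512 -> 1024 -> 512 -> 8 outputs
#   model.summary()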


@keras.saving.register_keras_serializable()
def custom_loss(
    preprocess,
    column_dict,
    h1,
    h2,
    h3,
    scaler_type="minmax",
    loss_variant="huber",
    delta=1.0,
):
    """
    Custom tensorflow loss function to combine Huber loss with mass balance.
    This is inspired by PINNs (Physics Informed Neural Networks), where the loss function is a combination of a physics-based loss and a data-driven loss.
    The mass balance is a physics-based loss that ensures the conservation of mass in the system.
    A tensorflow loss function accepts only the two arguments y_true and y_pred. Therefore, a nested function is used to pass the additional arguments.

    Args:
        preprocess: preprocessing object
        column_dict: dictionary with the column names as keys and the corresponding index as values
            (i.e. {'H': 0, 'O': 1, 'Ba': 2, 'Cl': 3, 'S': 4, 'Sr': 5, 'Barite': 6, 'Celestite': 7})
        h1: hyperparameter for the importance of the Huber loss
        h2: hyperparameter for the importance of the barium mass balance term
        h3: hyperparameter for the importance of the strontium mass balance term
        scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".
        loss_variant: Loss function approach. Choose between "huber" and "huber_mass_balance". Defaults to "huber".
        delta: Hyperparameter for the Huber function threshold. Defaults to 1.0.

    Returns:
        loss function
    """

    # as far as I know tensorflow does not directly support the use of scaler objects;
    # therefore, the backtransformation is done manually
    try:
        if scaler_type == "minmax":
            scale_X = tf.convert_to_tensor(
                preprocess.scaler_X.data_range_, dtype=tf.float32
            )
            min_X = tf.convert_to_tensor(
                preprocess.scaler_X.data_min_, dtype=tf.float32
            )
            scale_y = tf.convert_to_tensor(
                preprocess.scaler_y.data_range_, dtype=tf.float32
            )
            min_y = tf.convert_to_tensor(
                preprocess.scaler_y.data_min_, dtype=tf.float32
            )

        elif scaler_type == "standard":
            scale_X = tf.convert_to_tensor(
                preprocess.scaler_X.scale_, dtype=tf.float32)
            mean_X = tf.convert_to_tensor(
                preprocess.scaler_X.mean_, dtype=tf.float32)
            scale_y = tf.convert_to_tensor(
                preprocess.scaler_y.scale_, dtype=tf.float32)
            mean_y = tf.convert_to_tensor(
                preprocess.scaler_y.mean_, dtype=tf.float32)

        else:
            raise Exception(
                "No valid scaler type found. Choose between 'standard' and 'minmax'."
            )

    except AttributeError:
        raise Exception(
            "Data normalized with a different scaler than specified for the training. Compare the scaling approach between preprocessing and training."
        )

    def loss(results, predicted):
        # inverse min/max scaling
        if scaler_type == "minmax":
            predicted_inverse = predicted * scale_y + min_y
            results_inverse = results * scale_X + min_X

        # inverse standard scaling
        elif scaler_type == "standard":
            predicted_inverse = predicted * scale_y + mean_y
            results_inverse = results * scale_X + mean_X

        # apply expm1 to the columns of predicted_inverse and results_inverse if a log transformation was used
        if preprocess.func_dict_out is not None:
            predicted_inverse = tf.math.expm1(predicted_inverse)
            results_inverse = tf.math.expm1(results_inverse)

        # mass balance:
        # in total no barium and strontium should be lost in one simulation step
        dBa = tf.keras.backend.abs(
            (
                predicted_inverse[:, column_dict["Ba"]]
                + predicted_inverse[:, column_dict["Barite"]]
            )
            - (
                results_inverse[:, column_dict["Ba"]]
                + results_inverse[:, column_dict["Barite"]]
            )
        )
        dSr = tf.keras.backend.abs(
            (
                predicted_inverse[:, column_dict["Sr"]]
                + predicted_inverse[:, column_dict["Celestite"]]
            )
            - (
                results_inverse[:, column_dict["Sr"]]
                + results_inverse[:, column_dict["Celestite"]]
            )
        )

        # huber loss
        huber_loss = tf.keras.losses.Huber(delta)(results, predicted)

        # total loss
        if loss_variant == "huber":
            total_loss = huber_loss
        elif loss_variant == "huber_mass_balance":
            total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr
        else:
            raise Exception(
                "No valid loss variant found. Choose between 'huber' and 'huber_mass_balance'."
            )

        return total_loss

    return loss
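
# Illustrative wiring (hypothetical weights; src/optuna_runs.py passes
# Optuna-suggested values):
#   loss_fn = custom_loss(preprocess, column_dict, h1=1.0, h2=0.5, h3=0.5,
#                         scaler_type="minmax", loss_variant="huber_mass_balance")
#   model.compile(optimizer="adam", loss=loss_fn)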


def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
    """Auxiliary function to calculate the mass balance during training.

    Args:
        preprocess: preprocessing object
        column_dict: dictionary with the column names as keys and the corresponding index as values
        scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".

    Returns:
        mean of both mass balance terms
    """

    if scaler_type == "minmax":
        scale_X = tf.convert_to_tensor(
            preprocess.scaler_X.data_range_, dtype=tf.float32
        )
        min_X = tf.convert_to_tensor(
            preprocess.scaler_X.data_min_, dtype=tf.float32)
        scale_y = tf.convert_to_tensor(
            preprocess.scaler_y.data_range_, dtype=tf.float32
        )
        min_y = tf.convert_to_tensor(
            preprocess.scaler_y.data_min_, dtype=tf.float32)

    elif scaler_type == "standard":
        scale_X = tf.convert_to_tensor(
            preprocess.scaler_X.scale_, dtype=tf.float32)
        mean_X = tf.convert_to_tensor(
            preprocess.scaler_X.mean_, dtype=tf.float32)
        scale_y = tf.convert_to_tensor(
            preprocess.scaler_y.scale_, dtype=tf.float32)
        mean_y = tf.convert_to_tensor(
            preprocess.scaler_y.mean_, dtype=tf.float32)

    def mass_balance(results, predicted):
        # inverse min/max scaling
        if scaler_type == "minmax":
            predicted_inverse = predicted * scale_y + min_y
            results_inverse = results * scale_X + min_X

        # inverse standard scaling
        elif scaler_type == "standard":
            predicted_inverse = predicted * scale_y + mean_y
            results_inverse = results * scale_X + mean_X

        # apply expm1 if a log transformation was used
        if preprocess.func_dict_out is not None:
            predicted_inverse = tf.math.expm1(predicted_inverse)
            results_inverse = tf.math.expm1(results_inverse)

        # mass balance
        dBa = tf.keras.backend.abs(
            (
                predicted_inverse[:, column_dict["Ba"]]
                + predicted_inverse[:, column_dict["Barite"]]
            )
            - (
                results_inverse[:, column_dict["Ba"]]
                + results_inverse[:, column_dict["Barite"]]
            )
        )
        dSr = tf.keras.backend.abs(
            (
                predicted_inverse[:, column_dict["Sr"]]
                + predicted_inverse[:, column_dict["Celestite"]]
            )
            - (
                results_inverse[:, column_dict["Sr"]]
                + results_inverse[:, column_dict["Celestite"]]
            )
        )
        return tf.reduce_mean(dBa + dSr)

    return mass_balance


def huber_metric(delta=1.0):
    """Auxiliary function to calculate the Huber loss during training.

    Args:
        delta (float, optional): Threshold of the Huber function. Defaults to 1.0.

    Returns:
        metric function
    """

    def huber(results, predicted):
        huber_loss = tf.keras.losses.Huber(delta)(results, predicted)
        return huber_loss

    return huber


def mass_balance_evaluation(model, X, preprocess):
    """Calculates the mass balance difference for each cell.

    Args:
        model: trained model
        X: data for which the mass balance should be calculated
        preprocess: preprocessing object

    Returns:
        data frame with the mass balance difference for each cell
    """

    # predict the chemistry
    columns = X.iloc[:, X.columns != "Class"].columns
    classes = X["Class"]
    classes.reset_index(drop=True, inplace=True)
    prediction = pd.DataFrame(model.predict(X[columns]), columns=columns)
    # backtransform min/max or standard scaler
    X = pd.DataFrame(
        preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]),
        columns=columns,
    )
    prediction = pd.DataFrame(
        preprocess.scaler_y.inverse_transform(prediction), columns=columns
    )

    # apply backtransformation if a log transformation was applied
    if preprocess.func_dict_out is not None:
        X = preprocess.funcInverse(X)[0]
        prediction = preprocess.funcInverse(prediction)[0]

    # calculate mass balance
    dBa = np.abs(
        (prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"]))
    dSr = np.abs(
        (prediction["Sr"] + prediction["Celestite"]) -
        (X["Sr"] + X["Celestite"])
    )

    mass_balance_result = pd.DataFrame(
        {"dBa": dBa, "dSr": dSr, "mass_balance": dBa + dSr, "Class": classes}
    )

    return mass_balance_result


def mass_balance_ratio(results, threshold=1e-5):
    """Fraction of cells whose total mass balance error stays below the threshold, overall and per class."""
    proportion = {}

    mass_balance_threshold = results[results["mass_balance"] <= threshold]

    overall = len(mass_balance_threshold)
    class_0_amount = len(
        mass_balance_threshold[mass_balance_threshold["Class"] == 0])
    class_1_amount = len(
        mass_balance_threshold[mass_balance_threshold["Class"] == 1])

    proportion["overall"] = overall / len(results)
    proportion["class_0"] = class_0_amount / \
        len(results[results["Class"] == 0])
    proportion["class_1"] = class_1_amount / \
        len(results[results["Class"] == 1])

    return proportion
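
# Illustrative chain (hypothetical), e.g. after a training run:
#   mb = mass_balance_evaluation(model, X_test, preprocess)
#   print(mass_balance_ratio(mb, threshold=1e-5))
#   # -> {'overall': ..., 'class_0': ..., 'class_1': ...}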


class preprocessing:
    """
    A class used to preprocess data for model training.

    Attributes:
        random_state: seed used for all random operations
        scaler_X, scaler_y: fitted scalers for the design and target data
        func_dict_in, func_dict_out: optional column-wise transformation and backtransformation functions
        state: dictionary tracking which preprocessing steps have been applied
    """

    def __init__(self, func_dict_in=None, func_dict_out=None, random_state=42):
        """Initialization of the preprocessing object.

        Args:
            func_dict_in: function for transformation. Defaults to None.
            func_dict_out: function for backtransformation. Defaults to None.
            random_state (int, optional): Seed for reproducibility. Defaults to 42.
        """
        self.random_state = random_state
        self.scaler_X = None
        self.scaler_y = None
        self.func_dict_in = func_dict_in
        self.func_dict_out = func_dict_out
        self.state = {"cluster": False, "log": False,
                      "balance": False, "scale": False}

    def funcTranform(self, *args):
        """Apply the transformation function to the data columnwise.

        Returns:
            pandas data frames: transformed data
        """
        for i in args:
            for key in i.keys():
                if "Class" not in key:
                    i[key] = i[key].apply(self.func_dict_in)
        self.state["log"] = True
        return args

    def funcInverse(self, *args):
        """Apply the backtransformation function to the data columnwise.

        Returns:
            pandas data frames: backtransformed data
        """
        for i in args:
            for key in i.keys():
                if "Class" not in key:
                    i[key] = i[key].apply(self.func_dict_out)
        self.state["log"] = False
        return args

    def cluster(self, X, y, species="Barite", n_clusters=2, x_length=50, y_length=50):
        """Apply k-means clustering to the data to differentiate between reactive and non-reactive cells.

        Args:
            X: design data set
            y: target data set
            species (str, optional): Chemical species to which clustering is applied. Defaults to "Barite".
            n_clusters (int, optional): Number of clusters. Defaults to 2.
            x_length: x dimension of the grid. Defaults to 50.
            y_length: y dimension of the grid. Defaults to 50.

        Returns:
            X, y dataframes with an additional column "Class" containing the cluster labels.
        """
        class_labels = np.array([])
        grid_length = x_length * y_length
        iterations = int(len(X) / grid_length)

        # calculate the cluster for each chemical iteration step
        for i in range(0, iterations):
            field = np.array(
                X[species][(i * grid_length): (i * grid_length + grid_length)]
            ).reshape(x_length, y_length)
            kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state).fit(
                field.reshape(-1, 1)
            )
            class_labels = np.append(class_labels.astype(int), kmeans.labels_)

        if "Class" in X.columns and "Class" in y.columns:
            print("Class column already exists")
        else:
            class_labels_df = pd.DataFrame(class_labels, columns=["Class"])
            X = pd.concat([X, class_labels_df], axis=1)
            y = pd.concat([y, class_labels_df], axis=1)
            self.state["cluster"] = True

        return X, y

    def balancer(self, X, y, strategy, sample_fraction=0.5):
        """Apply sampling strategies to balance the dataset.

        Args:
            X: design dataset (before the simulation)
            y: target dataset (after the simulation)
            strategy: Sampling strategy. Choose between "smote" (Synthetic Minority Oversampling Technique), "over" (oversampling) and "under" (undersampling).
            sample_fraction (float, optional): Defines the balancer target, i.e. the target fraction of the minority class after the balancing step. Defaults to 0.5.

        Returns:
            X, y: resampled datasets
        """
        number_features = (X.columns != "Class").sum()
        if "Class" not in X.columns:
            if "Class" in y.columns:
                classes = y["Class"]
            else:
                raise Exception("No class column found")
        else:
            classes = X["Class"]
        counter = classes.value_counts()
        print("Amount class 0 before:",
              counter[0] / (counter[0] + counter[1]))
        print("Amount class 1 before:",
              counter[1] / (counter[0] + counter[1]))
        df = pd.concat(
            [
                X.loc[:, X.columns != "Class"],
                y.loc[:, y.columns != "Class"],
                classes,
            ],
            axis=1,
        )

        if strategy == "smote":
            print("Using SMOTE strategy")
            smote = SMOTE(sampling_strategy=sample_fraction)
            df_resampled, classes_resampled = smote.fit_resample(
                df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"]
            )

        elif strategy == "over":
            print("Using oversampling")
            over = RandomOverSampler()
            df_resampled, classes_resampled = over.fit_resample(
                df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"]
            )

        elif strategy == "under":
            print("Using undersampling")
            under = RandomUnderSampler()
            df_resampled, classes_resampled = under.fit_resample(
                df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"]
            )

        else:
            print("No sampling selected. Output equals input.")
            return X, y

        counter = classes_resampled["Class"].value_counts()
        print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]))
        print("Amount class 1 after:", counter[1] / (counter[0] + counter[1]))

        design_resampled = pd.concat(
            [df_resampled.iloc[:, 0:number_features], classes_resampled], axis=1
        )
        target_resampled = pd.concat(
            [df_resampled.iloc[:, number_features:], classes_resampled], axis=1
        )

        self.state["balance"] = True
        return design_resampled, target_resampled

    def scale_fit(self, X, y, scaling, type="standard"):
        """Fit a scaler for data preprocessing.

        Args:
            X: design dataset
            y: target dataset
            scaling: learn individual scalers for X and y when "individual" is selected, or one global scaler on all data in X and y if "global" is selected (scaler_X and scaler_y are then identical)
            type (str, optional): Use min/max scaling ("minmax") or standardization ("standard"). Defaults to "standard".
        """

        if type == "minmax":
            self.scaler_X = MinMaxScaler()
            self.scaler_y = MinMaxScaler()
        elif type == "standard":
            self.scaler_X = StandardScaler()
            self.scaler_y = StandardScaler()

        else:
            raise Exception("No valid scaler type found")

        if scaling == "individual":
            self.scaler_X.fit(X.iloc[:, X.columns != "Class"])
            self.scaler_y.fit(y.iloc[:, y.columns != "Class"])

        elif scaling == "global":
            self.scaler_X.fit(
                pd.concat(
                    [X.iloc[:, X.columns != "Class"],
                     y.iloc[:, y.columns != "Class"]],
                    axis=0,
                )
            )
            self.scaler_y = self.scaler_X

        self.state["scale"] = True

    def scale_transform(self, X_train, X_test, y_train, y_test):
        """Apply the learned scalers to the datasets.

        Args:
            X_train: design training data
            X_test: design test data
            y_train: target training data
            y_test: target test data

        Returns:
            transformed dataframes
        """

        X_train = pd.concat(
            [
                self.scaler_X.transform(
                    X_train.loc[:, X_train.columns != "Class"]),
                X_train.loc[:, "Class"],
            ],
            axis=1,
        )

        X_test = pd.concat(
            [
                self.scaler_X.transform(
                    X_test.loc[:, X_test.columns != "Class"]),
                X_test.loc[:, "Class"],
            ],
            axis=1,
        )

        y_train = pd.concat(
            [
                self.scaler_y.transform(
                    y_train.loc[:, y_train.columns != "Class"]),
                y_train.loc[:, "Class"],
            ],
            axis=1,
        )

        y_test = pd.concat(
            [
                self.scaler_y.transform(
                    y_test.loc[:, y_test.columns != "Class"]),
                y_test.loc[:, "Class"],
            ],
            axis=1,
        )

        return X_train, X_test, y_train, y_test

    def scale_inverse(self, *args):
        """Backtransform the datasets.

        Returns:
            backtransformed data frames
        """

        result = []
        for i in args:
            if "Class" in i.columns:
                inversed = pd.DataFrame(
                    self.scaler_X.inverse_transform(
                        i.loc[:, i.columns != "Class"]),
                    columns=i.columns[:-1],
                )
                class_column = i.loc[:, "Class"].reset_index(drop=True)
                i = pd.concat([inversed, class_column], axis=1)
            else:
                i = pd.DataFrame(
                    self.scaler_X.inverse_transform(i), columns=i.columns)
            result.append(i)
        return result

    def split(self, X, y, ratio=0.8):
        """Split the datasets into train and test sets; ratio is the test fraction."""
        # sklearn returns X_train, X_test, y_train, y_test in this order
        X_train, X_test, y_train, y_test = sk.train_test_split(
            X, y, test_size=ratio, random_state=self.random_state
        )

        return X_train, X_test, y_train, y_test

    def class_selection(self, *args, class_label=0):
        """Select only rows with a specific class label.

        Args:
            *args: dataframes from which rows with the given class label should be selected.
            class_label (int, optional): Class label to keep. Defaults to 0.

        Returns:
            Dataframes containing only the selected class label.
        """
        # rebinding the loop variable does not modify *args, so collect results explicitly
        result = []
        for i in args:
            result.append(i[i["Class"] == class_label])

        return result
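
Taken together, a minimal sketch of how the class is meant to be driven, using synthetic data with hypothetical values (the real pipeline feeds `barite_50_4_corner.h5` through `src/optuna_runs.py`):

```python
import numpy as np
import pandas as pd
from preprocessing import preprocessing

cols = ['H', 'O', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']
rng = np.random.default_rng(42)
# one 50x50 grid iteration -> 2500 cells, matching cluster()'s defaults
X = pd.DataFrame(rng.random((2500, 8)), columns=cols)
y = pd.DataFrame(rng.random((2500, 8)), columns=cols)

prep = preprocessing()
X, y = prep.cluster(X, y)                                # adds "Class" labels
X_tr, X_te, y_tr, y_te = prep.split(X, y, ratio=0.2)     # 20 % held out for testing
X_tr, y_tr = prep.balancer(X_tr, y_tr, strategy="over")  # oversample the minority class
prep.scale_fit(X_tr, y_tr, scaling="global", type="minmax")
X_tr, X_te, y_tr, y_te = prep.scale_transform(X_tr, X_te, y_tr, y_te)
```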