mirror of https://git.gfz-potsdam.de/naaice/model-training.git (synced 2025-12-13 10:28:22 +01:00)
restructure project
This commit is contained in:
parent
019a426af4
commit
052cf356c9
BIN  Barite_50_Data.h5 (Stored with Git LFS)
Binary file not shown.
BIN  Barite_50_Data_inference.h5 (Stored with Git LFS)
Binary file not shown.
BIN  Barite_50_Data_training.h5 (Stored with Git LFS)
Binary file not shown.
BIN  barite_50_4_corner.h5 (Stored with Git LFS)
Binary file not shown.
24
doc/measurement_plan.md
Normal file
@@ -0,0 +1,24 @@
# Measurement overview for model optimization

### Parameters to optimize

### Saved models

`./results/model_large_standardization.keras`: trained on the `barite_50_4_corner.h5` dataset with the extended loss function (Huber loss plus mass-balance terms) and **standardized data**
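A minimal sketch of what such an extended loss can look like (illustrative only: the column indices are placeholders, and the repository's actual implementation is `custom_loss` in `src/preprocessing.py`, which also undoes the feature scaling and weights the terms with the Optuna-tuned `h1`, `h2`, `h3`):

```python
import tensorflow as tf

def extended_loss(h1=1.0, h2=1.0, h3=1.0, ba=3, barite=7, sr=6, celestite=8):
    """Huber loss plus penalties on the Ba and Sr mass-balance errors."""
    huber = tf.keras.losses.Huber()

    def loss(y_true, y_pred):
        # conserved element = aqueous species + mineral phase
        d_ba = tf.abs((y_pred[:, ba] + y_pred[:, barite]) - (y_true[:, ba] + y_true[:, barite]))
        d_sr = tf.abs((y_pred[:, sr] + y_pred[:, celestite]) - (y_true[:, sr] + y_true[:, celestite]))
        return h1 * huber(y_true, y_pred) + h2 * d_ba + h3 * d_sr

    return loss
```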
### Experiments

| **Experiment** | **Dataset**           | **Model** | **Loss function** | **Activation** | **Preprocessing** |
|----------------|-----------------------|-----------|-------------------|----------------|-------------------|
| 1              | barite_50_4_corner.h5 | large     | Huber+dBa+dSr     | LeakyReLU      | Standardization   |
| 2              | barite_50_4_corner.h5 |           |                   |                |                   |
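The knobs in this table map onto the Optuna search space in `src/optuna_runs.py`; a minimal sketch of the study setup (the objective body is elided, return values are placeholders):

```python
import optuna

def objective(trial):
    model_type = trial.suggest_categorical("model", ["simple", "large", "paper"])
    scaler_type = trial.suggest_categorical("scaler", ["standard", "minmax"])
    h1 = trial.suggest_float("h1", 0.1, 1)
    # ... build, train and evaluate the model here ...
    prediction_loss, mass_balance_ratio = 1.0, 0.0  # placeholders
    return prediction_loss, mass_balance_ratio

study = optuna.create_study(
    storage="sqlite:///model_optimization.db",
    study_name="model_optimization",
    directions=["minimize", "maximize"],  # minimize loss, maximize mass-balance ratio
)
study.optimize(objective, n_trials=1000)
```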
Binary file not shown. (Before: 85 KiB)
BIN  loss_all.png
Binary file not shown. (Before: 62 KiB)
225
optuna_runs.py
@@ -1,225 +0,0 @@
import keras
from keras.layers import Dense, Dropout, Input, BatchNormalization
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from preprocessing import *
from sklearn import set_config
from importlib import reload
set_config(transform_output = "pandas")

dtype = "float32"
activation = "relu"

lr = 0.001
batch_size = 512
epochs = 50  # default 400 epochs

lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=lr,
    decay_steps=2000,
    decay_rate=0.9,
    staircase=True
)

optimizer_simple = keras.optimizers.Adam(learning_rate=lr_schedule)
optimizer_large = keras.optimizers.Adam(learning_rate=lr_schedule)
optimizer_paper = keras.optimizers.Adam(learning_rate=lr_schedule)

sample_fraction = 0.8

# small model
model_simple = keras.Sequential(
    [
        keras.Input(shape=(9,), dtype="float32"),
        keras.layers.Dense(units=128, activation="linear", dtype="float32"),
        # Dropout(0.2),
        keras.layers.Dense(units=128, activation="elu", dtype="float32"),
        keras.layers.Dense(units=9, dtype="float32")
    ]
)

def Safelog(val):
    # signed log10: keeps the sign of the value
    if val > 0:
        return np.log10(val)
    elif val < 0:
        return -np.log10(-val)
    else:
        return 0

def Safeexp(val):
    if val > 0:
        return -10 ** -val
    elif val < 0:
        return 10 ** val
    else:
        return 0

# ? Why does the charge use a different logarithm than the other species?

func_dict_in = {
    "H": np.log1p,
    "O": np.log1p,
    "Charge": Safelog,
    "H_0_": np.log1p,
    "O_0_": np.log1p,
    "Ba": np.log1p,
    "Cl": np.log1p,
    "S_2_": np.log1p,
    "S_6_": np.log1p,
    "Sr": np.log1p,
    "Barite": np.log1p,
    "Celestite": np.log1p,
}

func_dict_out = {
    "H": np.expm1,
    "O": np.expm1,
    "Charge": Safeexp,
    "H_0_": np.expm1,
    "O_0_": np.expm1,
    "Ba": np.expm1,
    "Cl": np.expm1,
    "S_2_": np.expm1,
    "S_6_": np.expm1,
    "Sr": np.expm1,
    "Barite": np.expm1,
    "Celestite": np.expm1,
}

# os.chdir('/mnt/beegfs/home/signer/projects/model-training')
data_file = h5py.File("barite_50_4_corner.h5")

design = data_file["design"]
results = data_file["result"]

df_design = pd.DataFrame(np.array(design["data"]).transpose(), columns=np.array(design["names"].asstr()))
df_results = pd.DataFrame(np.array(results["data"]).transpose(), columns=np.array(results["names"].asstr()))

data_file.close()

species_columns = ['H', 'O', 'Charge', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']

preprocess = preprocessing(func_dict_in=func_dict_in, func_dict_out=func_dict_out)
X, y = preprocess.cluster(df_design[species_columns], df_results[species_columns])
# X, y = preprocess.funcTranform(X, y)

X_train, X_test, y_train, y_test = preprocess.split(X, y, ratio=0.2)
X_train, y_train = preprocess.balancer(X_train, y_train, strategy="over")
preprocess.scale_fit(X_train, y_train, scaling="individual")
X_train, X_test, y_train, y_test = preprocess.scale_transform(X_train, X_test, y_train, y_test)
X_train, X_val, y_train, y_val = preprocess.split(X_train, y_train, ratio=0.1)

column_dict = {"Ba": X.columns.get_loc("Ba"), "Barite": X.columns.get_loc("Barite"), "Sr": X.columns.get_loc("Sr"), "Celestite": X.columns.get_loc("Celestite"), "H": X.columns.get_loc("H"), "O": X.columns.get_loc("O")}

def custom_loss(preprocess, column_dict, h1, h2, h3, h4):
    # extract the scaling parameters
    scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
    min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
    scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
    min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)

    def loss(results, predicted):
        # inverse min/max scaling
        predicted_inverse = predicted * scale_X + min_X
        results_inverse = results * scale_y + min_y

        # mass balance
        dBa = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["Ba"]] + predicted_inverse[:, column_dict["Barite"]]) -
            (results_inverse[:, column_dict["Ba"]] + results_inverse[:, column_dict["Barite"]])
        )
        dSr = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["Sr"]] + predicted_inverse[:, column_dict["Celestite"]]) -
            (results_inverse[:, column_dict["Sr"]] + results_inverse[:, column_dict["Celestite"]])
        )

        # H/O ratio has to be 2
        h2o_ratio = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["H"]] / predicted_inverse[:, column_dict["O"]]) - 2
        )

        # huber loss
        huber_loss = tf.keras.losses.Huber()(results, predicted)

        # total loss
        total_loss = h1 * huber_loss + h2 * dBa**2 + h3 * dSr**2  # + h4 * h2o_ratio**2

        return total_loss

    return loss

def mass_balance(model, X, preprocess):

    # predict the chemistry
    columns = X.iloc[:, X.columns != "Class"].columns
    prediction = pd.DataFrame(model.predict(X[columns]), columns=columns)

    # backtransform min/max
    X = pd.DataFrame(preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]), columns=columns)
    prediction = pd.DataFrame(preprocess.scaler_y.inverse_transform(prediction), columns=columns)

    # calculate mass balance
    dBa = np.abs((prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"]))
    dSr = np.abs((prediction["Sr"] + prediction["Celestite"]) - (X["Sr"] + X["Celestite"]))

    return dBa + dSr

import optuna

def create_model(model, preprocess, h1, h2, h3, h4):

    model.compile(optimizer=optimizer_simple, loss=custom_loss(preprocess, column_dict, h1, h2, h3, h4))

    return model


def objective(trial, preprocess, X_train, y_train, X_val, y_val, X_test, y_test):
    h1 = trial.suggest_float("h1", 0.1, 10)
    h2 = trial.suggest_float("h2", 0.1, 10)
    h3 = trial.suggest_float("h3", 0.1, 10)
    h4 = trial.suggest_float("h4", 0.1, 10)

    model = create_model(model_simple, preprocess, h1, h2, h3, h4)

    callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
    history = model.fit(X_train.loc[:, X_train.columns != "Class"],
                        y_train.loc[:, y_train.columns != "Class"],
                        batch_size=batch_size,
                        epochs=50,
                        validation_data=(X_val.loc[:, X_val.columns != "Class"], y_val.loc[:, y_val.columns != "Class"]),
                        callbacks=[callback])

    prediction_loss = model.evaluate(X_test.loc[:, X_test.columns != "Class"], y_test.loc[:, y_test.columns != "Class"])
    mass_balance_results = mass_balance(model, X_test, preprocess)

    mass_balance_ratio = len(mass_balance_results[mass_balance_results < 1e-5]) / len(mass_balance_results)

    return prediction_loss, mass_balance_ratio

if __name__ == "__main__":
    study = optuna.create_study(storage="sqlite:///model_optimization.db", study_name="model_optimization", directions=["minimize", "maximize"])
    study.optimize(lambda trial: objective(trial, preprocess, X_train, y_train, X_val, y_val, X_test, y_test), n_trials=1000)

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
332
preprocessing.py
@@ -1,332 +0,0 @@
import keras
print("Running Keras in version {}".format(keras.__version__))

import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import clone

# preprocessing pipeline
#

def Safelog(val):
    # signed log10: keeps the sign of the value
    if val > 0:
        return np.log10(val)
    elif val < 0:
        return -np.log10(-val)
    else:
        return 0

def Safeexp(val):
    if val > 0:
        return -10 ** -val
    elif val < 0:
        return 10 ** val
    else:
        return 0


class FuncTransform():
    '''
    Class to transform and inverse transform data with given functions.
    Transform and inverse transform functions have to be given as dictionaries in the following format:
    {'key1': function1, 'key2': function2, ...}
    '''

    def __init__(self, func_transform, func_inverse):
        self.func_transform = func_transform
        self.func_inverse = func_inverse

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for key in X.keys():
            if "Class" not in key:
                X[key] = X[key].apply(self.func_transform[key])
        return X

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X, y)

    def inverse_transform(self, X_log):
        X_log = X_log.copy()
        for key in X_log.keys():
            if "Class" not in key:
                X_log[key] = X_log[key].apply(self.func_inverse[key])
        return X_log


def clustering(X, n_clusters=2, random_state=42, x_length=50, y_length=50, species='Barite'):
    '''
    Function to cluster data with KMeans.
    '''

    class_labels = np.array([])
    grid_length = x_length * y_length
    iterations = int(len(X) / grid_length)

    for i in range(0, iterations):
        field = np.array(X[species][(i*grid_length):(i*grid_length+grid_length)]
                         ).reshape(x_length, y_length)
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(
            field.reshape(-1, 1))

        class_labels = np.append(class_labels.astype(int), kmeans.labels_)

    if("Class" in X.columns):
        print("Class column already exists")
        X_clustered = X
    else:
        class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
        X_clustered = pd.concat([X, class_labels_df], axis=1)

    return X_clustered


def balancer(design, target, strategy, sample_fraction=0.5):

    number_features = (design.columns != "Class").sum()
    if("Class" not in design.columns):
        if("Class" in target.columns):
            classes = target['Class']
        else:
            raise Exception("No class column found")
    else:
        classes = design['Class']
    counter = classes.value_counts()
    print("Amount class 0 before:", counter[0] / (counter[0] + counter[1]))
    print("Amount class 1 before:", counter[1] / (counter[0] + counter[1]))
    df = pd.concat([design.loc[:, design.columns != "Class"], target.loc[:, target.columns != "Class"], classes], axis=1)

    if strategy == 'smote':
        print("Using SMOTE strategy")
        smote = SMOTE(sampling_strategy=sample_fraction)
        df_resampled, classes_resampled = smote.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])

    elif strategy == 'over':
        print("Using Oversampling")
        over = RandomOverSampler()
        df_resampled, classes_resampled = over.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])

    elif strategy == 'under':
        print("Using Undersampling")
        under = RandomUnderSampler()
        df_resampled, classes_resampled = under.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])

    else:
        return design, target

    counter = classes_resampled["Class"].value_counts()
    print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]))
    print("Amount class 1 after:", counter[1] / (counter[0] + counter[1]))

    design_resampled = pd.concat([df_resampled.iloc[:, 0:number_features], classes_resampled], axis=1)
    target_resampled = pd.concat([df_resampled.iloc[:, number_features:], classes_resampled], axis=1)

    return design_resampled, target_resampled


def plot_simulation(X, timestep, component='Barite', x_length=50, y_length=50):
    grid_length = x_length * y_length
    max_iter = int(len(X) / grid_length)
    if(timestep >= max_iter):
        raise Exception("timestep is not in the simulation range")

    plt.imshow(np.array(X[component][(timestep*grid_length):(timestep*grid_length+grid_length)]).reshape(x_length, y_length), interpolation='bicubic', origin='lower')

    if("Class" in X.columns):
        plt.contour(np.array(X['Class'][(timestep*grid_length):(timestep*grid_length+grid_length)]).reshape(x_length, y_length), levels=[0.1], colors='red', origin='lower')

    plt.show()


def preprocessing_training(df_design, df_targets, func_dict_in, func_dict_out, sampling, scaling, test_size):

    df_design = clustering(df_design)
    df_targets = pd.concat([df_targets, df_design['Class']], axis=1)

    df_design_log = FuncTransform(func_dict_in, func_dict_out).fit_transform(df_design)
    df_results_log = FuncTransform(func_dict_in, func_dict_out).fit_transform(df_targets)

    X_train, X_test, y_train, y_test = sk.train_test_split(df_design_log, df_results_log, test_size=test_size, random_state=42)

    X_train, y_train = balancer(X_train, y_train, sampling)

    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    if scaling == 'individual':
        scaler_X.fit(X_train.iloc[:, X_train.columns != "Class"])
        scaler_y.fit(y_train.iloc[:, y_train.columns != "Class"])

    elif scaling == 'global':
        scaler_X.fit(pd.concat([X_train.iloc[:, X_train.columns != "Class"], y_train.iloc[:, y_train.columns != "Class"]], axis=0))
        scaler_y = scaler_X

    X_train = pd.concat([scaler_X.transform(X_train.loc[:, X_train.columns != "Class"]), X_train.loc[:, "Class"]], axis=1)
    X_test = pd.concat([scaler_X.transform(X_test.loc[:, X_test.columns != "Class"]), X_test.loc[:, "Class"]], axis=1)

    y_train = pd.concat([scaler_y.transform(y_train.loc[:, y_train.columns != "Class"]), y_train.loc[:, "Class"]], axis=1)
    y_test = pd.concat([scaler_y.transform(y_test.loc[:, y_test.columns != "Class"]), y_test.loc[:, "Class"]], axis=1)

    X_train, X_val, y_train, y_val = sk.train_test_split(X_train, y_train, test_size=0.1)

    return X_train, X_val, X_test, y_train, y_val, y_test, scaler_X, scaler_y


class preprocessing:

    def __init__(self, func_dict_in, func_dict_out, random_state=42):
        self.random_state = random_state
        self.scaler_X = None
        self.scaler_y = None
        self.func_dict_in = func_dict_in
        self.func_dict_out = func_dict_out
        self.state = {"cluster": False, "log": False, "balance": False, "scale": False}

    def funcTranform(self, X, y):
        for key in X.keys():
            if "Class" not in key:
                X[key] = X[key].apply(self.func_dict_in[key])
                y[key] = y[key].apply(self.func_dict_in[key])
        self.state["log"] = True

        return X, y

    def funcInverse(self, X, y):

        for key in X.keys():
            if "Class" not in key:
                X[key] = X[key].apply(self.func_dict_out[key])
                y[key] = y[key].apply(self.func_dict_out[key])
        self.state["log"] = False
        return X, y

    def cluster(self, X, y, species='Barite', n_clusters=2, x_length=50, y_length=50):

        class_labels = np.array([])
        grid_length = x_length * y_length
        iterations = int(len(X) / grid_length)

        for i in range(0, iterations):
            field = np.array(X[species][(i*grid_length):(i*grid_length+grid_length)]
                             ).reshape(x_length, y_length)
            kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state).fit(field.reshape(-1, 1))
            class_labels = np.append(class_labels.astype(int), kmeans.labels_)

        if ("Class" in X.columns and "Class" in y.columns):
            print("Class column already exists")
        else:
            class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
            X = pd.concat([X, class_labels_df], axis=1)
            y = pd.concat([y, class_labels_df], axis=1)
        self.state["cluster"] = True

        return X, y


    def balancer(self, X, y, strategy, sample_fraction=0.5):

        number_features = (X.columns != "Class").sum()
        if("Class" not in X.columns):
            if("Class" in y.columns):
                classes = y['Class']
            else:
                raise Exception("No class column found")
        else:
            classes = X['Class']
        counter = classes.value_counts()
        print("Amount class 0 before:", counter[0] / (counter[0] + counter[1]))
        print("Amount class 1 before:", counter[1] / (counter[0] + counter[1]))
        df = pd.concat([X.loc[:, X.columns != "Class"], y.loc[:, y.columns != "Class"], classes], axis=1)

        if strategy == 'smote':
            print("Using SMOTE strategy")
            smote = SMOTE(sampling_strategy=sample_fraction)
            df_resampled, classes_resampled = smote.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])

        elif strategy == 'over':
            print("Using Oversampling")
            over = RandomOverSampler()
            df_resampled, classes_resampled = over.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])

        elif strategy == 'under':
            print("Using Undersampling")
            under = RandomUnderSampler()
            df_resampled, classes_resampled = under.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])

        else:
            return X, y

        counter = classes_resampled["Class"].value_counts()
        print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]))
        print("Amount class 1 after:", counter[1] / (counter[0] + counter[1]))

        design_resampled = pd.concat([df_resampled.iloc[:, 0:number_features], classes_resampled], axis=1)
        target_resampled = pd.concat([df_resampled.iloc[:, number_features:], classes_resampled], axis=1)

        self.state['balance'] = True
        return design_resampled, target_resampled


    def scale_fit(self, X, y, scaling):

        if scaling == 'individual':
            self.scaler_X = MinMaxScaler()
            self.scaler_y = MinMaxScaler()
            self.scaler_X.fit(X.iloc[:, X.columns != "Class"])
            self.scaler_y.fit(y.iloc[:, y.columns != "Class"])

        elif scaling == 'global':
            self.scaler_X = MinMaxScaler()
            self.scaler_X.fit(pd.concat([X.iloc[:, X.columns != "Class"], y.iloc[:, y.columns != "Class"]], axis=0))
            self.scaler_y = self.scaler_X

        self.state['scale'] = True

    def scale_transform(self, X_train, X_test, y_train, y_test):
        X_train = pd.concat([self.scaler_X.transform(X_train.loc[:, X_train.columns != "Class"]), X_train.loc[:, "Class"]], axis=1)

        X_test = pd.concat([self.scaler_X.transform(X_test.loc[:, X_test.columns != "Class"]), X_test.loc[:, "Class"]], axis=1)

        y_train = pd.concat([self.scaler_y.transform(y_train.loc[:, y_train.columns != "Class"]), y_train.loc[:, "Class"]], axis=1)

        y_test = pd.concat([self.scaler_y.transform(y_test.loc[:, y_test.columns != "Class"]), y_test.loc[:, "Class"]], axis=1)

        return X_train, X_test, y_train, y_test

    def scale_inverse(self, X):

        if("Class" in X.columns):
            X = pd.concat([self.scaler_X.inverse_transform(X.loc[:, X.columns != "Class"]), X.loc[:, "Class"]], axis=1)
        else:
            X = self.scaler_X.inverse_transform(X)

        return X

    def split(self, X, y, ratio=0.8):
        # sk.train_test_split returns X_train, X_test, y_train, y_test
        X_train, X_test, y_train, y_test = sk.train_test_split(X, y, test_size=ratio, random_state=self.random_state)

        return X_train, X_test, y_train, y_test
File diff suppressed because one or more lines are too long
106
src/optuna_runs.py
Normal file
@@ -0,0 +1,106 @@
import keras
from keras.layers import Dense, Dropout, Input, BatchNormalization
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from preprocessing import *
from sklearn import set_config
from importlib import reload
set_config(transform_output = "pandas")
import optuna
import pickle


data_file = h5py.File("../datasets/barite_50_4_corner.h5")


def objective(trial, df_design, df_results, species_columns):

    # hyperparameters searched by Optuna
    model_type = trial.suggest_categorical("model", ["simple", "large", "paper"])
    scaler_type = trial.suggest_categorical("scaler", ["standard", "minmax"])
    sampling_type = trial.suggest_categorical("sampling", ["over", "off"])

    # preprocessing: clustering, split, balancing, scaling
    preprocess = preprocessing()
    X, y = preprocess.cluster(df_design[species_columns], df_results[species_columns])
    X_train, X_test, y_train, y_test = preprocess.split(X, y, ratio=0.2)
    X_train, y_train = preprocess.balancer(X_train, y_train, strategy=sampling_type)
    preprocess.scale_fit(X_train, y_train, scaling="global", type=scaler_type)
    X_train, X_test, y_train, y_test = preprocess.scale_transform(X_train, X_test, y_train, y_test)
    X_train, X_val, y_train, y_val = preprocess.split(X_train, y_train, ratio=0.1)

    column_dict = {"Ba": X.columns.get_loc("Ba"), "Barite": X.columns.get_loc("Barite"), "Sr": X.columns.get_loc("Sr"), "Celestite": X.columns.get_loc("Celestite"), "H": X.columns.get_loc("H"), "O": X.columns.get_loc("O")}

    # weights of the loss terms
    h1 = trial.suggest_float("h1", 0.1, 1)
    h2 = trial.suggest_float("h2", 0.1, 1)
    h3 = trial.suggest_float("h3", 0.1, 1)

    model = model_definition(model_type)

    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.001,
        decay_steps=2000,
        decay_rate=0.9,
        staircase=True
    )
    optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

    model.compile(optimizer=optimizer, loss=custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type), metrics=[huber_metric(1.0), mass_balance_metric(preprocess, column_dict, scaler_type="minmax")])

    callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
    history = model.fit(X_train.loc[:, X_train.columns != "Class"],
                        y_train.loc[:, y_train.columns != "Class"],
                        batch_size=512,
                        epochs=100,
                        validation_data=(X_val.loc[:, X_val.columns != "Class"], y_val.loc[:, y_val.columns != "Class"]),
                        callbacks=[callback])

    # evaluate() returns [loss, huber, mass_balance] because metrics are compiled in; keep the loss
    prediction_loss = model.evaluate(X_test.loc[:, X_test.columns != "Class"], y_test.loc[:, y_test.columns != "Class"])[0]
    dBa, dSr, _ = mass_balance_evaluation(model, X_test, preprocess)
    mass_balance_results = dBa + dSr

    # fraction of test samples whose mass-balance error is below 1e-5
    mass_balance_ratio = len(mass_balance_results[mass_balance_results < 1e-5]) / len(mass_balance_results)

    model_save_path_trial = os.path.join("../results/models/", f"model_trial_{trial.number}.h5")
    history_save_path_trial = os.path.join("../results/history/", f"history_trial_{trial.number}.pkl")

    model.save(model_save_path_trial)
    with open(history_save_path_trial, 'wb') as f:
        pickle.dump(history.history, f)

    return prediction_loss, mass_balance_ratio


if __name__ == "__main__":

    design = data_file["design"]
    results = data_file["result"]

    df_design = pd.DataFrame(np.array(design["data"]).transpose(), columns=np.array(design["names"].asstr()))
    df_results = pd.DataFrame(np.array(results["data"]).transpose(), columns=np.array(results["names"].asstr()))

    data_file.close()

    species_columns = ['H', 'O', 'Charge', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']

    study = optuna.create_study(storage="sqlite:///model_optimization.db", study_name="model_optimization", directions=["minimize", "maximize"])
    study.optimize(lambda trial: objective(trial, df_design, df_results, species_columns), n_trials=1000)

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    # note: a multi-objective study exposes its Pareto front via study.best_trials
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
357
src/preprocessing.py
Normal file
@@ -0,0 +1,357 @@
import keras
from keras.layers import Dense, Dropout, Input, BatchNormalization, LeakyReLU
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from sklearn import set_config
from importlib import reload
set_config(transform_output = "pandas")


# preprocessing pipeline
#

def Safelog(val):
    # signed log10: keeps the sign of the value
    if val > 0:
        return np.log10(val)
    elif val < 0:
        return -np.log10(-val)
    else:
        return 0

def Safeexp(val):
    if val > 0:
        return -10 ** -val
    elif val < 0:
        return 10 ** val
    else:
        return 0


def model_definition(architecture):
    dtype = "float32"

    if architecture in ("simple", "small"):  # the Optuna search space calls this option "simple"
        model = keras.Sequential(
            [
                keras.Input(shape=(8,), dtype="float32"),
                keras.layers.Dense(units=128, dtype="float32"),
                LeakyReLU(alpha=0.01),
                # Dropout(0.2),
                keras.layers.Dense(units=128, dtype="float32"),
                LeakyReLU(alpha=0.01),
                keras.layers.Dense(units=8, dtype="float32")
            ]
        )

    elif architecture == "large":
        model = keras.Sequential(
            [
                keras.layers.Input(shape=(8,), dtype=dtype),
                keras.layers.Dense(512, dtype=dtype),
                LeakyReLU(alpha=0.01),
                keras.layers.Dense(1024, dtype=dtype),
                LeakyReLU(alpha=0.01),
                keras.layers.Dense(512, dtype=dtype),
                LeakyReLU(alpha=0.01),
                keras.layers.Dense(8, dtype=dtype)
            ]
        )

    elif architecture == "paper":
        model = keras.Sequential(
            [
                keras.layers.Input(shape=(8,), dtype=dtype),
                keras.layers.Dense(128, dtype=dtype),
                LeakyReLU(alpha=0.01),
                keras.layers.Dense(256, dtype=dtype),
                LeakyReLU(alpha=0.01),
                keras.layers.Dense(512, dtype=dtype),
                LeakyReLU(alpha=0.01),
                keras.layers.Dense(256, dtype=dtype),
                LeakyReLU(alpha=0.01),
                keras.layers.Dense(8, dtype=dtype)
            ]
        )

    return model


def custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type="minmax"):
    # extract the scaling parameters

    if scaler_type == "minmax":
        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
        min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
        min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)

    elif scaler_type == "standard":
        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
        mean_X = tf.convert_to_tensor(preprocess.scaler_X.mean_, dtype=tf.float32)
        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
        mean_y = tf.convert_to_tensor(preprocess.scaler_y.mean_, dtype=tf.float32)

    def loss(results, predicted):

        # inverse min/max scaling
        if scaler_type == "minmax":
            predicted_inverse = predicted * scale_y + min_y
            results_inverse = results * scale_X + min_X

        elif scaler_type == "standard":
            predicted_inverse = predicted * scale_y + mean_y
            results_inverse = results * scale_X + mean_X

        # mass balance
        dBa = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["Ba"]] + predicted_inverse[:, column_dict["Barite"]]) -
            (results_inverse[:, column_dict["Ba"]] + results_inverse[:, column_dict["Barite"]])
        )
        dSr = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["Sr"]] + predicted_inverse[:, column_dict["Celestite"]]) -
            (results_inverse[:, column_dict["Sr"]] + results_inverse[:, column_dict["Celestite"]])
        )

        # H/O ratio has to be 2
        # h2o_ratio = tf.keras.backend.abs(
        #     (predicted_inverse[:, column_dict["H"]] / predicted_inverse[:, column_dict["O"]]) - 2
        # )

        # huber loss
        huber_loss = tf.keras.losses.Huber()(results, predicted)

        # total loss
        total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr  # + h4 * h2o_ratio
        # total_loss = huber_loss
        return total_loss

    return loss


def mass_balance_evaluation(model, X, preprocess):

    # predict the chemistry
    columns = X.iloc[:, X.columns != "Class"].columns
    prediction = pd.DataFrame(model.predict(X[columns]), columns=columns)

    # backtransform min/max or standard scaler
    X = pd.DataFrame(preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]), columns=columns)
    prediction = pd.DataFrame(preprocess.scaler_y.inverse_transform(prediction), columns=columns)

    # calculate mass balance
    dBa = np.abs((prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"]))
    print(dBa.min())
    dSr = np.abs((prediction["Sr"] + prediction["Celestite"]) - (X["Sr"] + X["Celestite"]))
    print(dSr.min())
    return dBa, dSr, prediction


def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):

    if scaler_type == "minmax":
        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
        min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
        min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)

    elif scaler_type == "standard":
        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
        mean_X = tf.convert_to_tensor(preprocess.scaler_X.mean_, dtype=tf.float32)
        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
        mean_y = tf.convert_to_tensor(preprocess.scaler_y.mean_, dtype=tf.float32)


    def mass_balance(results, predicted):
        # inverse min/max scaling
        if scaler_type == "minmax":
            predicted_inverse = predicted * scale_y + min_y
            results_inverse = results * scale_X + min_X

        elif scaler_type == "standard":
            predicted_inverse = predicted * scale_y + mean_y
            results_inverse = results * scale_X + mean_X

        # mass balance
        dBa = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["Ba"]] + predicted_inverse[:, column_dict["Barite"]]) -
            (results_inverse[:, column_dict["Ba"]] + results_inverse[:, column_dict["Barite"]])
        )
        dSr = tf.keras.backend.abs(
            (predicted_inverse[:, column_dict["Sr"]] + predicted_inverse[:, column_dict["Celestite"]]) -
            (results_inverse[:, column_dict["Sr"]] + results_inverse[:, column_dict["Celestite"]])
        )

        return tf.reduce_mean(dBa + dSr)

    return mass_balance


def huber_metric(delta=1.0):
    def huber(results, predicted):
        return tf.keras.losses.huber(results, predicted, delta=delta)

    return huber


class preprocessing:

    def __init__(self, func_dict_in=None, func_dict_out=None, random_state=42):
        self.random_state = random_state
        self.scaler_X = None
        self.scaler_y = None
        self.func_dict_in = func_dict_in
        self.func_dict_out = func_dict_out
        self.state = {"cluster": False, "log": False, "balance": False, "scale": False}

    def funcTranform(self, X, y):
        for key in X.keys():
            if "Class" not in key:
                X[key] = X[key].apply(self.func_dict_in[key])
                y[key] = y[key].apply(self.func_dict_in[key])
        self.state["log"] = True

        return X, y

    def funcInverse(self, X, y):

        for key in X.keys():
            if "Class" not in key:
                X[key] = X[key].apply(self.func_dict_out[key])
                y[key] = y[key].apply(self.func_dict_out[key])
        self.state["log"] = False
        return X, y

    def cluster(self, X, y, species='Barite', n_clusters=2, x_length=50, y_length=50):

        class_labels = np.array([])
        grid_length = x_length * y_length
        iterations = int(len(X) / grid_length)

        for i in range(0, iterations):
            field = np.array(X[species][(i*grid_length):(i*grid_length+grid_length)]
                             ).reshape(x_length, y_length)
            kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state).fit(field.reshape(-1, 1))
            class_labels = np.append(class_labels.astype(int), kmeans.labels_)

        if ("Class" in X.columns and "Class" in y.columns):
            print("Class column already exists")
        else:
            class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
            X = pd.concat([X, class_labels_df], axis=1)
            y = pd.concat([y, class_labels_df], axis=1)
        self.state["cluster"] = True

        return X, y


    def balancer(self, X, y, strategy, sample_fraction=0.5):

        number_features = (X.columns != "Class").sum()
        if("Class" not in X.columns):
            if("Class" in y.columns):
                classes = y['Class']
            else:
                raise Exception("No class column found")
        else:
            classes = X['Class']
        counter = classes.value_counts()
        print("Amount class 0 before:", counter[0] / (counter[0] + counter[1]))
        print("Amount class 1 before:", counter[1] / (counter[0] + counter[1]))
        df = pd.concat([X.loc[:, X.columns != "Class"], y.loc[:, y.columns != "Class"], classes], axis=1)

        if strategy == 'smote':
            print("Using SMOTE strategy")
            smote = SMOTE(sampling_strategy=sample_fraction)
            df_resampled, classes_resampled = smote.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])

        elif strategy == 'over':
            print("Using Oversampling")
            over = RandomOverSampler()
            df_resampled, classes_resampled = over.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])

        elif strategy == 'under':
            print("Using Undersampling")
            under = RandomUnderSampler()
            df_resampled, classes_resampled = under.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])

        else:
            return X, y

        counter = classes_resampled["Class"].value_counts()
        print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]))
        print("Amount class 1 after:", counter[1] / (counter[0] + counter[1]))

        design_resampled = pd.concat([df_resampled.iloc[:, 0:number_features], classes_resampled], axis=1)
        target_resampled = pd.concat([df_resampled.iloc[:, number_features:], classes_resampled], axis=1)

        self.state['balance'] = True
        return design_resampled, target_resampled


    def scale_fit(self, X, y, scaling, type='standard'):

        if type == 'minmax':
            self.scaler_X = MinMaxScaler()
            self.scaler_y = MinMaxScaler()
        elif type == 'standard':
            self.scaler_X = StandardScaler()
            self.scaler_y = StandardScaler()

        else:
            raise Exception("No valid scaler type found")

        if scaling == 'individual':
            self.scaler_X.fit(X.iloc[:, X.columns != "Class"])
            self.scaler_y.fit(y.iloc[:, y.columns != "Class"])

        elif scaling == 'global':
            self.scaler_X.fit(pd.concat([X.iloc[:, X.columns != "Class"], y.iloc[:, y.columns != "Class"]], axis=0))
            self.scaler_y = self.scaler_X

        self.state['scale'] = True

    def scale_transform(self, X_train, X_test, y_train, y_test):
        X_train = pd.concat([self.scaler_X.transform(X_train.loc[:, X_train.columns != "Class"]), X_train.loc[:, "Class"]], axis=1)

        X_test = pd.concat([self.scaler_X.transform(X_test.loc[:, X_test.columns != "Class"]), X_test.loc[:, "Class"]], axis=1)

        y_train = pd.concat([self.scaler_y.transform(y_train.loc[:, y_train.columns != "Class"]), y_train.loc[:, "Class"]], axis=1)

        y_test = pd.concat([self.scaler_y.transform(y_test.loc[:, y_test.columns != "Class"]), y_test.loc[:, "Class"]], axis=1)

        return X_train, X_test, y_train, y_test

    def scale_inverse(self, X):

        if("Class" in X.columns):
            X = pd.concat([self.scaler_X.inverse_transform(X.loc[:, X.columns != "Class"]), X.loc[:, "Class"]], axis=1)
        else:
            X = self.scaler_X.inverse_transform(X)

        return X

    def split(self, X, y, ratio=0.8):
        # sk.train_test_split returns X_train, X_test, y_train, y_test
        X_train, X_test, y_train, y_test = sk.train_test_split(X, y, test_size=ratio, random_state=self.random_state)

        return X_train, X_test, y_train, y_test