Compare commits

...

4 Commits

Author         SHA1        Message                                                          Date
Hannes Signer  09a5687580  delete old files                                                 2025-03-27 15:12:56 +01:00
Hannes Signer  bbccd1444d  update preprocessing                                             2025-03-27 14:59:52 +01:00
Hannes Signer  e21c7bede8  add template for test cases                                      2025-03-27 14:52:32 +01:00
Hannes Signer  5b520c368d  adapt scaler for different number of input and output features  2025-03-27 14:52:18 +01:00
6 changed files with 410 additions and 1335 deletions

BIN
results/adam_history.pkl (Stored with Git LFS)

Binary file not shown.

BIN
results/rmsprop_history.pkl (Stored with Git LFS)

Binary file not shown.

BIN
results/sgd_history.pkl (Stored with Git LFS)

Binary file not shown.

File diff suppressed because one or more lines are too long


@@ -160,38 +160,29 @@ def custom_loss(
preprocess.scaler_type, scaler_type))
if scaler_type == "minmax":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.data_range_, dtype=tf.float32
data_range = tf.convert_to_tensor(
preprocess.scaler_output.data_range_, dtype=tf.float32
)
min_X = tf.convert_to_tensor(
preprocess.scaler_X.data_min_, dtype=tf.float32
)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.data_range_, dtype=tf.float32
)
min_y = tf.convert_to_tensor(
preprocess.scaler_y.data_min_, dtype=tf.float32
min_values = tf.convert_to_tensor(
preprocess.scaler_output.data_min_, dtype=tf.float32
)
elif scaler_type == "standard":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.scale_, dtype=tf.float32)
mean_X = tf.convert_to_tensor(
preprocess.scaler_X.mean_, dtype=tf.float32)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.scale_, dtype=tf.float32)
mean_y = tf.convert_to_tensor(
preprocess.scaler_y.mean_, dtype=tf.float32)
scale_output = tf.convert_to_tensor(
preprocess.scaler_output.scale_, dtype=tf.float32)
mean_output = tf.convert_to_tensor(
preprocess.scaler_output.mean_, dtype=tf.float32)
def loss(results, predicted):
# inverse min/max scaling
if scaler_type == "minmax":
predicted_inverse = predicted * scale_y + min_y
results_inverse = results * scale_X + min_X
predicted_inverse = predicted * data_range + min_values
results_inverse = results * data_range + min_values
# inverse standard scaling
elif scaler_type == "standard":
predicted_inverse = predicted * scale_y + mean_y
results_inverse = results * scale_X + mean_X
predicted_inverse = predicted * scale_output + mean_output
results_inverse = results * scale_output + mean_output
elif scaler_type == "none":
predicted_inverse = predicted
@@ -204,6 +195,8 @@ def custom_loss(
# mass balance
# in total, no barium or strontium should be lost within one simulation step
# TODO: encapsulate the mass balance terms in a function
dBa = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["Ba"]]
@@ -224,6 +217,19 @@ def custom_loss(
+ results_inverse[:, column_dict["Celestite"]]
)
)
dS = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["S"]]
+ predicted_inverse[:, column_dict["Celestite"]]
+ predicted_inverse[:, column_dict["Barite"]]
)
- (
results_inverse[:, column_dict["S"]]
+ results_inverse[:, column_dict["Celestite"]]
+ results_inverse[:, column_dict["Barite"]]
)
)
# huber loss
huber_loss = tf.keras.losses.Huber(delta)(results, predicted)
@@ -233,6 +239,8 @@ def custom_loss(
total_loss = huber_loss
elif loss_variant == "huber_mass_balance":
total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr
elif "huber_mass_balance_extended":
total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr + h3 * dS
else:
raise Exception(
"No valid loss variant found. Choose between 'huber', 'huber_mass_balance' and 'huber_mass_balance_extended'."
@@ -243,7 +251,7 @@ def custom_loss(
return loss
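For intuition, here is a minimal standalone sketch of one mass-balance term. The column layout in column_dict is an assumption for illustration only; the real mapping comes from the preprocessing pipeline.

    import tensorflow as tf

    # Hypothetical column order; only for this sketch.
    column_dict = {"Ba": 0, "Barite": 1, "Sr": 2, "Celestite": 3, "S": 4}
    results = tf.constant([[1.00, 0.50, 0.80, 0.20, 0.70]])    # reference step
    predicted = tf.constant([[0.90, 0.55, 0.80, 0.20, 0.75]])  # network output

    # Total barium (dissolved Ba plus Ba bound in barite) should be conserved:
    dBa = tf.abs(
        (predicted[:, column_dict["Ba"]] + predicted[:, column_dict["Barite"]])
        - (results[:, column_dict["Ba"]] + results[:, column_dict["Barite"]])
    )
    print(float(dBa[0]))  # ~0.05: barium created or lost by the prediction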
def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
def mass_balance_metric(preprocess, column_dict, scaler_type="minmax", loss_variant="huber_mass_balance"):
"""Auxilary function to calculate the mass balance during training.
Args:
@@ -256,36 +264,29 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
"""
if scaler_type == "minmax":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.data_range_, dtype=tf.float32
data_range = tf.convert_to_tensor(
preprocess.scaler_output.data_range_, dtype=tf.float32
)
min_X = tf.convert_to_tensor(
preprocess.scaler_X.data_min_, dtype=tf.float32)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.data_range_, dtype=tf.float32
min_values = tf.convert_to_tensor(
preprocess.scaler_output.data_min_, dtype=tf.float32
)
min_y = tf.convert_to_tensor(
preprocess.scaler_y.data_min_, dtype=tf.float32)
elif scaler_type == "standard":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.scale_, dtype=tf.float32)
mean_X = tf.convert_to_tensor(
preprocess.scaler_X.mean_, dtype=tf.float32)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.scale_, dtype=tf.float32)
mean_y = tf.convert_to_tensor(
preprocess.scaler_y.mean_, dtype=tf.float32)
scale_output = tf.convert_to_tensor(
preprocess.scaler_output.scale_, dtype=tf.float32)
mean_output = tf.convert_to_tensor(
preprocess.scaler_output.mean_, dtype=tf.float32)
def mass_balance(results, predicted):
# inverse min/max scaling
if scaler_type == "minmax":
predicted_inverse = predicted * scale_y + min_y
results_inverse = results * scale_X + min_X
predicted_inverse = predicted * data_range + min_values
results_inverse = results * data_range + min_values
# inverse standard scaling
elif scaler_type == "standard":
predicted_inverse = predicted * scale_y + mean_y
results_inverse = results * scale_X + mean_X
predicted_inverse = predicted * scale_output + mean_output
results_inverse = results * scale_output + mean_output
elif scaler_type == "none":
predicted_inverse = predicted
@@ -306,6 +307,7 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
+ results_inverse[:, column_dict["Barite"]]
)
)
dSr = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["Sr"]]
@@ -316,11 +318,74 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
+ results_inverse[:, column_dict["Celestite"]]
)
)
return tf.reduce_mean(dBa + dSr)
dS = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["S"]]
+ predicted_inverse[:, column_dict["Celestite"]]
+ predicted_inverse[:, column_dict["Barite"]]
)
- (
results_inverse[:, column_dict["S"]]
+ results_inverse[:, column_dict["Celestite"]]
+ results_inverse[:, column_dict["Barite"]]
)
)
if loss_variant == "huber_mass_balance":
return tf.reduce_mean(dBa + dSr)
elif loss_variant == "huber_mass_balance_extended":
return tf.reduce_mean(dBa + dSr + dS)
return mass_balance
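A hedged usage sketch: model, preprocess and column_dict are assumed to exist already; the returned closure plugs directly into Keras as a metric.

    mb = mass_balance_metric(
        preprocess,
        column_dict,
        scaler_type="minmax",
        loss_variant="huber_mass_balance_extended",
    )
    model.compile(optimizer="adam", loss="mse", metrics=[mb])  # assumed model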
# def mass_balance_barium(predicted_inverse, results_inverse, column_dict):
# dBa = tf.keras.backend.abs(
# (
# predicted_inverse[:, column_dict["Ba"]]
# + predicted_inverse[:, column_dict["Barite"]]
# )
# - (
# results_inverse[:, column_dict["Ba"]]
# + results_inverse[:, column_dict["Barite"]]
# )
# )
# return dBa
# def mass_balance_strontium(predicted_inverse, results_inverse, column_dict):
# dSr = tf.keras.backend.abs(
# (
# predicted_inverse[:, column_dict["Sr"]]
# + predicted_inverse[:, column_dict["Celestite"]]
# )
# - (
# results_inverse[:, column_dict["Sr"]]
# + results_inverse[:, column_dict["Celestite"]]
# )
# )
# return dSr
# def mass_balance_sulfur(predicted_inverse, results_inverse, column_dict):
# dS = tf.keras.backend.abs(
# (
# predicted_inverse[:, column_dict["S"]]
# + predicted_inverse[:, column_dict["Celestite"]]
# + predicted_inverse[:, column_dict["Barite"]]
# )
# - (
# results_inverse[:, column_dict["S"]]
# + results_inverse[:, column_dict["Celestite"]]
# + results_inverse[:, column_dict["Barite"]]
# )
# )
# return dS
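One way to resolve the TODO above (encapsulate the mass-balance terms) is a single generic helper instead of three near-identical functions. This is a sketch, not code from the repository:

    def mass_balance_term(predicted_inverse, results_inverse, column_dict, species):
        # Absolute change in the summed amounts of the given species columns,
        # e.g. species=["Ba", "Barite"] reproduces dBa above.
        predicted_total = sum(predicted_inverse[:, column_dict[s]] for s in species)
        results_total = sum(results_inverse[:, column_dict[s]] for s in species)
        return tf.keras.backend.abs(predicted_total - results_total)

    # dBa = mass_balance_term(pred, res, column_dict, ["Ba", "Barite"])
    # dSr = mass_balance_term(pred, res, column_dict, ["Sr", "Celestite"])
    # dS  = mass_balance_term(pred, res, column_dict, ["S", "Celestite", "Barite"])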
def huber_metric(delta=1.0):
"""Auxilary function to calculate the Huber loss during training.
@@ -337,8 +402,9 @@ def huber_metric(delta=1.0):
return huber
def mass_balance_evaluation(model, X, y, preprocess):
"""Calculates the mass balance difference for each cell.
def mass_balance_evaluation(model, X, preprocess):
"""Calculates the mass balance difference for each cell
between the predicted values and the design dataset.
Args:
model: trained model
@@ -353,18 +419,12 @@ def mass_balance_evaluation(model, X, y, preprocess):
columns = X.iloc[:, X.columns != "Class"].columns
classes = X["Class"]
classes.reset_index(drop=True, inplace=True)
prediction = pd.DataFrame(model.predict(X[columns]), columns=y.columns)
prediction = pd.DataFrame(model.predict(X[columns]), columns=preprocess.scaler_output.feature_names_in_)
# backtransform min/max or standard scaler
if preprocess.scaler_X is not None:
X = pd.DataFrame(
preprocess.scaler_X.inverse_transform(
X.iloc[:, X.columns != "Class"]),
columns=columns,
)
prediction = pd.DataFrame(
preprocess.scaler_y.inverse_transform(prediction), columns=columns
)
if preprocess.scaler_input is not None:
X = preprocess.scale_inverse(X)[0]
prediction = preprocess.scale_inverse(prediction)[0]
# apply backtransformation if log transformation was applied
if preprocess.func_dict_out is not None:
@@ -378,9 +438,12 @@ def mass_balance_evaluation(model, X, y, preprocess):
(prediction["Sr"] + prediction["Celestite"]) -
(X["Sr"] + X["Celestite"])
)
dS = np.abs(
(prediction["S"] + prediction["Celestite"] + prediction["Barite"]) -
(X["S"] + X["Celestite"] + X["Barite"]))
mass_balance_result = pd.DataFrame(
{"dBa": dBa, "dSr": dSr, "mass_balance": dBa + dSr, "Class": classes}
{"dBa": dBa, "dSr": dSr, "dS": dS, "mass_balance": dBa + dSr, "mass_balance_extended": dBa+dSr+dS, "Class": classes}
)
return mass_balance_result
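A hedged usage sketch, assuming a trained model, a design frame X_test with a "Class" column, and a fitted preprocess object:

    mb = mass_balance_evaluation(model, X_test, preprocess)
    # Average conservation error per cluster, for both loss variants:
    print(mb.groupby("Class")[["mass_balance", "mass_balance_extended"]].mean())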
@@ -421,8 +484,8 @@ class preprocessing:
random_state (int, optional): Seed for reproducibility. Defaults to 42.
"""
self.random_state = random_state
self.scaler_X = None
self.scaler_y = None
self.scaler_input = None
self.scaler_output = None
self.func_dict_in = func_dict_in
self.func_dict_out = func_dict_out
self.state = {"cluster": False, "log": False,
@@ -500,8 +563,10 @@ class preprocessing:
label = np.zeros(len(X))
label[X[species] > threshold] = 1
X["Class"] = label
y["Class"] = label
X = X.copy()
y = y.copy()
X.loc[:, "Class"] = label
y.loc[:, "Class"] = label
return X, y
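The switch to .copy() plus .loc avoids pandas' chained-assignment pitfall when X or y is a slice of a larger frame. A toy illustration, not repository code:

    import pandas as pd

    df = pd.DataFrame({"Barite": [0.0, 2.5, 0.1]})
    subset = df[df["Barite"] >= 0]   # may be a view of df
    # subset["Class"] = 1.0          # could raise SettingWithCopyWarning
    subset = subset.copy()           # independent frame, safe to label
    subset.loc[:, "Class"] = (subset["Barite"] > 1.0).astype(float)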
@@ -584,52 +649,47 @@ class preprocessing:
self.state["balance"] = True
return design_resampled, target_resampled
def scale_fit(self, X, y, scaling, type="standard"):
def scale_fit(self, X, y, type="standard"):
self.scaler_type = type
self.scaler_scope = scaling
"""Fit a scaler for data preprocessing.
Args:
X: design dataset
y: target dataset
scaling: learn individual scaler for X and y when "individual" is selected or one global scaler on all data in X and y if "global" is selected (scaler_X and scaler_y are equal)
scaling: fit a single scaler on all data in X and y. If X and y have different
dimensions, separate input and output scalers are fitted for the respective columns.
type (str, optional): Use min/max scaling or standardization. Defaults to "standard".
"""
if type == "minmax":
self.scaler_X = MinMaxScaler()
self.scaler_y = MinMaxScaler()
self.scaler_input = MinMaxScaler()
self.scaler_output = MinMaxScaler()
elif type == "standard":
self.scaler_X = StandardScaler()
self.scaler_y = StandardScaler()
self.scaler_input = StandardScaler()
self.scaler_output = StandardScaler()
else:
raise Exception("No valid scaler type found")
if scaling == "individual":
self.scaler_X.fit(X.iloc[:, X.columns != "Class"])
self.scaler_y.fit(y.iloc[:, y.columns != "Class"])
elif scaling == "global":
self.scaler_X.fit(
pd.concat(
[X.iloc[:, X.columns != "Class"],
y.iloc[:, y.columns != "Class"]],
axis=0,
)
)
self.scaler_y = self.scaler_X
all_data = pd.concat([X, y], axis=0)
if len(X.columns) == len(y.columns):
self.scaler_input.fit(all_data.loc[:, X.columns != "Class"])
self.scaler_output = self.scaler_input
else:
self.scaler_input.fit(all_data[X.columns[X.columns != "Class"]])
self.scaler_output.fit(all_data[y.columns[y.columns != "Class"]])
self.state["scale"] = True
return pd.concat(
[X.iloc[:, X.columns != "Class"],
y.iloc[:, y.columns != "Class"]],
axis=0,
)
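The new regime in a nutshell: if X and y share one column set, a single scaler fitted on their concatenation serves both sides, so the loss can undo the scaling of predictions and targets consistently; otherwise two scalers are fitted on the respective column subsets. A small sketch with assumed column names:

    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler

    X = pd.DataFrame({"Ba": [0.1, 0.4], "Sr": [0.2, 0.8]})  # inputs
    y = pd.DataFrame({"Ba": [0.2, 0.3], "Sr": [0.1, 0.9]})  # outputs, same columns
    shared = MinMaxScaler().fit(pd.concat([X, y], axis=0))
    # shared.data_range_ and shared.data_min_ now span both frames, which is
    # exactly what the custom loss uses to invert the scaling.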
def scale_transform(self, X_train, X_test, y_train, y_test):
def scale_transform(self, *args):
"""Apply learned scaler on datasets.
Args:
@@ -641,82 +701,31 @@ class preprocessing:
Returns:
transformed dataframes
"""
if self.scaler_scope == "global":
if len(X_train.columns) > len(y_train.columns):
y_train_modified = X_train.copy()
y_test_modified = X_test.copy()
for i in y_train_modified.columns:
if i in y_train.columns:
y_train_modified[i] = y_train[i]
y_test_modified[i] = y_test[i]
else:
y_train_modified[i] = np.nan
y_test_modified[i] = np.nan
y_train = y_train_modified
y_test = y_test_modified
results = []
for i in args:
# check which scaler should be used depending on the columns
if len(i.columns[i.columns != "Class"]) == len(self.scaler_input.feature_names_in_):
scaler = self.scaler_input
else:
X_train_modified = y_train.copy()
X_test_modified = y_test.copy()
for i in X_train_modified.columns:
if i in X_train.columns:
X_train_modified[i] = X_train[i]
X_test_modified[i] = X_test[i]
else:
X_train_modified[i] = np.nan
X_test_modified[i] = np.nan
X_train = X_train_modified
X_test = X_test_modified
X_train = pd.concat(
[
self.scaler_X.transform(
X_train.loc[:, X_train.columns != "Class"]),
X_train.loc[:, "Class"],
],
axis=1,
)
X_test = pd.concat(
[
self.scaler_X.transform(
X_test.loc[:, X_test.columns != "Class"]),
X_test.loc[:, "Class"],
],
axis=1,
)
y_train = pd.concat(
[
self.scaler_y.transform(
y_train.loc[:, y_train.columns != "Class"]),
y_train.loc[:, "Class"],
],
axis=1,
)
y_test = pd.concat(
[
self.scaler_y.transform(
y_test.loc[:, y_test.columns != "Class"]),
y_test.loc[:, "Class"],
],
axis=1,
)
X_train.dropna(axis=1, inplace=True)
X_test.dropna(axis=1, inplace=True)
y_train.dropna(axis=1, inplace=True)
y_test.dropna(axis=1, inplace=True)
return X_train, X_test, y_train, y_test
scaler = self.scaler_output
if "Class" in i.columns:
i = pd.concat(
[
scaler.transform(i.loc[:, i.columns != "Class"]),
i.loc[:, "Class"],
],
axis=1,
)
else:
i = scaler.transform(i)
results.append(i)
return results
def scale_inverse(self, *args):
"""Backtransform the dataset
@@ -725,65 +734,28 @@ class preprocessing:
Backtransformed data frames
"""
result = []
if self.scaler_scope == "individual":
for i in args:
if(len(i.columns) == len(self.scaler_X.feature_names_in_)):
scaler = self.scaler_X
else:
scaler = self.scaler_y
if "Class" in i.columns:
inversed = pd.DataFrame(
scaler.inverse_transform(
i.loc[:, i.columns != "Class"]),
columns=i.columns[:-1],
)
class_column = i.loc[:, "Class"].reset_index(drop=True)
i = pd.concat([inversed, class_column], axis=1)
else:
for i in args:
# check which scaler should be used depending on the columns
if len(i.columns[i.columns != "Class"]) == len(self.scaler_input.feature_names_in_):
scaler = self.scaler_input
else:
scaler = self.scaler_output
if "Class" in i.columns:
inversed = pd.DataFrame(
scaler.inverse_transform(
i.loc[:, i.columns != "Class"]),
columns=i.columns[:-1],
)
class_column = i.loc[:, "Class"].reset_index(drop=True)
i = pd.concat([inversed, class_column], axis=1)
else:
i = pd.DataFrame(
scaler.inverse_transform(i), columns=i.columns)
result.append(i)
elif self.scaler_scope == "global":
for i in args:
if (len(i.columns) == len(self.preprocess.scaler_X.feature_names_in_)):
if "Class" in i.columns:
inversed = pd.DataFrame(
self.scaler_X.inverse_transform(
i.loc[:, i.columns != "Class"]),
columns=i.columns[:-1],
)
class_column = i.loc[:, "Class"].reset_index(drop=True)
i = pd.concat([inversed, class_column], axis=1)
else:
i = pd.DataFrame(
self.scaler_X.inverse_transform(i), columns=i.columns)
result.append(i)
else:
df = pd.DataFrame()
for j in self.scaler_X.feature_names_in_:
if j in i.columns:
df[j] = i[j]
else:
df[j] = np.nan
if "Class" in i.columns:
inversed = pd.DataFrame(
self.scaler_X.inverse_transform(
df.loc[:, df.columns != "Class"]),
columns=df.columns[:-1],
)
else:
i = pd.DataFrame(
self.scaler_X.inverse_transform(df), columns=df.columns)
result.append(i)
result.append(i)
return result
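A round-trip sketch of the transform/inverse pair, assuming a preprocess object fitted via scale_fit on frames X and y:

    X_scaled, y_scaled = preprocess.scale_transform(X, y)
    X_back, y_back = preprocess.scale_inverse(X_scaled, y_scaled)
    # X_back and y_back should match X and y up to floating-point error.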

tests/test_scaler.py (new file, 14 lines added)

@@ -0,0 +1,14 @@
# import unittest
# import os
# os.chdir("../src/")
# print(os.getcwd())
# from preprocessing import *
# class TestScaler(unittest.TestCase):
# def test_sample(self):  # "test_" prefix so unittest discovers the method
# self.assertEqual(1, 1)
# if __name__ == '__main__':
# unittest.main()
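For reference, a runnable variant of the commented-out template might look like this (the test body is a placeholder):

    import unittest

    class TestScaler(unittest.TestCase):
        def test_sample(self):  # "test_" prefix so unittest discovers the method
            self.assertEqual(1, 1)

    if __name__ == "__main__":
        unittest.main()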