Mirror of https://git.gfz-potsdam.de/naaice/model-training.git
Synced 2025-12-15 19:58:22 +01:00
Compare commits: 4 commits, f2c89e0b83 ... 09a5687580
| Author | SHA1 | Date |
|---|---|---|
| | 09a5687580 | |
| | bbccd1444d | |
| | e21c7bede8 | |
| | 5b520c368d | |
BIN results/adam_history.pkl (Stored with Git LFS): binary file not shown.
BIN results/rmsprop_history.pkl (Stored with Git LFS): binary file not shown.
BIN results/sgd_history.pkl (Stored with Git LFS): binary file not shown.
File diff suppressed because one or more lines are too long
The hunks below modify the Python module that defines `custom_loss`, the mass balance metrics, and the `preprocessing` class.

```diff
@@ -160,38 +160,29 @@ def custom_loss(
             preprocess.scaler_type, scaler_type))

     if scaler_type == "minmax":
-        scale_X = tf.convert_to_tensor(
-            preprocess.scaler_X.data_range_, dtype=tf.float32
+        data_range = tf.convert_to_tensor(
+            preprocess.scaler_output.data_range_, dtype=tf.float32
         )
-        min_X = tf.convert_to_tensor(
-            preprocess.scaler_X.data_min_, dtype=tf.float32
-        )
-        scale_y = tf.convert_to_tensor(
-            preprocess.scaler_y.data_range_, dtype=tf.float32
-        )
-        min_y = tf.convert_to_tensor(
-            preprocess.scaler_y.data_min_, dtype=tf.float32
+        min_values = tf.convert_to_tensor(
+            preprocess.scaler_output.data_min_, dtype=tf.float32
         )

     elif scaler_type == "standard":
-        scale_X = tf.convert_to_tensor(
-            preprocess.scaler_X.scale_, dtype=tf.float32)
-        mean_X = tf.convert_to_tensor(
-            preprocess.scaler_X.mean_, dtype=tf.float32)
-        scale_y = tf.convert_to_tensor(
-            preprocess.scaler_y.scale_, dtype=tf.float32)
-        mean_y = tf.convert_to_tensor(
-            preprocess.scaler_y.mean_, dtype=tf.float32)
+        scale_output = tf.convert_to_tensor(
+            preprocess.scaler_output.scale_, dtype=tf.float32)
+        mean_output = tf.convert_to_tensor(
+            preprocess.scaler_output.mean_, dtype=tf.float32)

     def loss(results, predicted):
         # inverse min/max scaling
         if scaler_type == "minmax":
-            predicted_inverse = predicted * scale_y + min_y
-            results_inverse = results * scale_X + min_X
+            predicted_inverse = predicted * data_range + min_values
+            results_inverse = results * data_range + min_values

         # inverse standard scaling
         elif scaler_type == "standard":
-            predicted_inverse = predicted * scale_y + mean_y
-            results_inverse = results * scale_X + mean_X
+            predicted_inverse = predicted * scale_output + mean_output
+            results_inverse = results * scale_output + mean_output

         elif scaler_type == "none":
             predicted_inverse = predicted
```
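The pattern behind this hunk: the scikit-learn scaler parameters are materialized as TensorFlow constants once, when the loss factory runs, so the nested loss function can undo the scaling inside the training graph without calling back into sklearn per batch. A minimal sketch of the min/max case, with a hypothetical fitted scaler standing in for `preprocess.scaler_output`:

```python
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

# Hypothetical fitted scaler standing in for preprocess.scaler_output.
scaler_output = MinMaxScaler().fit(np.random.rand(100, 3))

# Captured once, outside the loss, so no sklearn call happens per batch.
data_range = tf.convert_to_tensor(scaler_output.data_range_, dtype=tf.float32)
min_values = tf.convert_to_tensor(scaler_output.data_min_, dtype=tf.float32)

def inverse_minmax(scaled):
    # MinMaxScaler maps x to (x - data_min_) / data_range_,
    # so the inverse is simply scaled * data_range_ + data_min_.
    return scaled * data_range + min_values
```

A single pair of constants now serves both `results` and `predicted` because input and output share one scaler when the column sets match, as shown in the `scale_fit` hunk further down.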
```diff
@@ -204,6 +195,8 @@ def custom_loss(

         # mass balance
         # in total no Barium and Strontium should be lost in one simulation step

+        # TODO: encapsulate the mass balance terms in a function
         dBa = tf.keras.backend.abs(
             (
                 predicted_inverse[:, column_dict["Ba"]]
@@ -224,6 +217,19 @@ def custom_loss(
                 + results_inverse[:, column_dict["Celestite"]]
             )
         )

+        dS = tf.keras.backend.abs(
+            (
+                predicted_inverse[:, column_dict["S"]]
+                + predicted_inverse[:, column_dict["Celestite"]]
+                + predicted_inverse[:, column_dict["Barite"]]
+            )
+            - (
+                results_inverse[:, column_dict["S"]]
+                + results_inverse[:, column_dict["Celestite"]]
+                + results_inverse[:, column_dict["Barite"]]
+            )
+        )
+
         # huber loss
         huber_loss = tf.keras.losses.Huber(delta)(results, predicted)
```
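The new `dS` term mirrors `dBa` and `dSr`: barite (BaSO4) and celestite (SrSO4) each bind one mole of sulfur per formula unit, so total sulfur per cell is the aqueous S plus the two mineral amounts, and the residual is the absolute change over one simulation step. A plain NumPy sketch of the same bookkeeping, with a hypothetical column layout standing in for `column_dict`:

```python
import numpy as np

# Hypothetical column layout standing in for column_dict.
cols = {"Ba": 0, "Sr": 1, "S": 2, "Barite": 3, "Celestite": 4}

def sulfur_residual(results, predicted):
    # Total sulfur = aqueous S + S bound in the two sulfate minerals.
    s_before = (results[:, cols["S"]]
                + results[:, cols["Celestite"]]
                + results[:, cols["Barite"]])
    s_after = (predicted[:, cols["S"]]
               + predicted[:, cols["Celestite"]]
               + predicted[:, cols["Barite"]])
    return np.abs(s_after - s_before)
```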
```diff
@@ -233,6 +239,8 @@ def custom_loss(
             total_loss = huber_loss
         elif loss_variant == "huber_mass_balance":
             total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr
+        elif "huber_mass_balance_extended":
+            total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr + h3 * dS
         else:
             raise Exception(
                 "No valid loss variant found. Choose between 'huber' and 'huber_mass_balance'."
```
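One caveat in the added branch: `elif "huber_mass_balance_extended":` tests a non-empty string literal rather than comparing `loss_variant`, so it is always true and the `else` clause, along with its exception, becomes unreachable for any unmatched variant. A sketch of the presumably intended dispatch (reusing `h3` for `dS` follows the committed line; a separate weight may have been intended):

```python
def combine_loss(loss_variant, huber_loss, dBa, dSr, dS, h1, h2, h3):
    if loss_variant == "huber":
        return huber_loss
    elif loss_variant == "huber_mass_balance":
        return h1 * huber_loss + h2 * dBa + h3 * dSr
    # Presumably intended comparison; the committed code omits
    # `loss_variant ==` and therefore always takes this branch.
    elif loss_variant == "huber_mass_balance_extended":
        return h1 * huber_loss + h2 * dBa + h3 * dSr + h3 * dS
    raise Exception(
        "No valid loss variant found. Choose between 'huber', "
        "'huber_mass_balance' and 'huber_mass_balance_extended'."
    )
```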
```diff
@@ -243,7 +251,7 @@ def custom_loss(
     return loss


-def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
+def mass_balance_metric(preprocess, column_dict, scaler_type="minmax", loss_variant="huber_mass_balance"):
     """Auxilary function to calculate the mass balance during training.

     Args:
@@ -256,36 +264,29 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
     """

     if scaler_type == "minmax":
-        scale_X = tf.convert_to_tensor(
-            preprocess.scaler_X.data_range_, dtype=tf.float32
+        data_range = tf.convert_to_tensor(
+            preprocess.scaler_output.data_range_, dtype=tf.float32
         )
-        min_X = tf.convert_to_tensor(
-            preprocess.scaler_X.data_min_, dtype=tf.float32)
-        scale_y = tf.convert_to_tensor(
-            preprocess.scaler_y.data_range_, dtype=tf.float32
+        min_values = tf.convert_to_tensor(
+            preprocess.scaler_output.data_min_, dtype=tf.float32
         )
-        min_y = tf.convert_to_tensor(
-            preprocess.scaler_y.data_min_, dtype=tf.float32)

     elif scaler_type == "standard":
-        scale_X = tf.convert_to_tensor(
-            preprocess.scaler_X.scale_, dtype=tf.float32)
-        mean_X = tf.convert_to_tensor(
-            preprocess.scaler_X.mean_, dtype=tf.float32)
-        scale_y = tf.convert_to_tensor(
-            preprocess.scaler_y.scale_, dtype=tf.float32)
-        mean_y = tf.convert_to_tensor(
-            preprocess.scaler_y.mean_, dtype=tf.float32)
+        scale_output = tf.convert_to_tensor(
+            preprocess.scaler_output.scale_, dtype=tf.float32)
+        mean_output = tf.convert_to_tensor(
+            preprocess.scaler_output.mean_, dtype=tf.float32)

     def mass_balance(results, predicted):
         # inverse min/max scaling
         if scaler_type == "minmax":
-            predicted_inverse = predicted * scale_y + min_y
-            results_inverse = results * scale_X + min_X
+            predicted_inverse = predicted * data_range + min_values
+            results_inverse = results * data_range + min_values

+        # inverse standard scaling
         elif scaler_type == "standard":
-            predicted_inverse = predicted * scale_y + mean_y
-            results_inverse = results * scale_X + mean_X
+            predicted_inverse = predicted * scale_output + mean_output
+            results_inverse = results * scale_output + mean_output

         elif scaler_type == "none":
             predicted_inverse = predicted
@@ -306,6 +307,7 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
                 + results_inverse[:, column_dict["Barite"]]
             )
         )

         dSr = tf.keras.backend.abs(
             (
                 predicted_inverse[:, column_dict["Sr"]]
```
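As with `custom_loss` above, `mass_balance_metric` is a factory: Keras invokes a metric with only `(y_true, y_pred)`, so scaler parameters and the loss variant have to be captured in a closure. A generic, self-contained illustration of the pattern (the metric name and scaling are hypothetical, not from this repository):

```python
import tensorflow as tf

def scaled_mae_metric(scale):
    """Illustrative factory: extra state is frozen into the closure."""
    scale = tf.convert_to_tensor(scale, dtype=tf.float32)

    def scaled_mae(y_true, y_pred):
        # Keras passes only the two tensors; `scale` comes from the closure.
        return tf.reduce_mean(tf.abs((y_true - y_pred) * scale))

    return scaled_mae
```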
```diff
@@ -316,11 +318,74 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
                 + results_inverse[:, column_dict["Celestite"]]
             )
         )
-        return tf.reduce_mean(dBa + dSr)
+
+        dS = tf.keras.backend.abs(
+            (
+                predicted_inverse[:, column_dict["S"]]
+                + predicted_inverse[:, column_dict["Celestite"]]
+                + predicted_inverse[:, column_dict["Barite"]]
+            )
+            - (
+                results_inverse[:, column_dict["S"]]
+                + results_inverse[:, column_dict["Celestite"]]
+                + results_inverse[:, column_dict["Barite"]]
+            )
+        )
+
+        if loss_variant == "huber_mass_balance":
+            return tf.reduce_mean(dBa + dSr)
+        elif loss_variant == "huber_mass_balance_extended":
+            return tf.reduce_mean(dBa + dSr + dS)

     return mass_balance


+# def mass_balance_barium(predicted_inverse, results_inverse, column_dict):
+#     dBa = tf.keras.backend.abs(
+#         (
+#             predicted_inverse[:, column_dict["Ba"]]
+#             + predicted_inverse[:, column_dict["Barite"]]
+#         )
+#         - (
+#             results_inverse[:, column_dict["Ba"]]
+#             + results_inverse[:, column_dict["Barite"]]
+#         )
+#     )
+
+#     return dBa
+
+
+# def mass_balance_strontium(predicted_inverse, results_inverse, column_dict):
+#     dSr = tf.keras.backend.abs(
+#         (
+#             predicted_inverse[:, column_dict["Sr"]]
+#             + predicted_inverse[:, column_dict["Celestite"]]
+#         )
+#         - (
+#             results_inverse[:, column_dict["Sr"]]
+#             + results_inverse[:, column_dict["Celestite"]]
+#         )
+#     )
+
+#     return dSr
+
+
+# def mass_balance_sulfur(predicted_inverse, results_inverse, column_dict):
+#     dS = tf.keras.backend.abs(
+#         (
+#             predicted_inverse[:, column_dict["S"]]
+#             + predicted_inverse[:, column_dict["Celestite"]]
+#             + predicted_inverse[:, column_dict["Barite"]]
+#         )
+#         - (
+#             results_inverse[:, column_dict["S"]]
+#             + results_inverse[:, column_dict["Celestite"]]
+#             + results_inverse[:, column_dict["Barite"]]
+#         )
+#     )
+
+#     return dS
+
+
 def huber_metric(delta=1.0):
     """Auxilary function to calculate the Huber loss during training.
```
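Taken together, the factories above plug into Keras as the loss and named metrics. A hypothetical wiring, assuming a fitted `preprocess` object and a `column_dict` as in the hunks (the full signature of `custom_loss` is not shown in this diff, so the keyword arguments here are assumptions):

```python
# Hypothetical usage; `model`, `preprocess` and `column_dict` are assumed
# to exist as in the surrounding module.
model.compile(
    optimizer="adam",
    loss=custom_loss(preprocess, column_dict,
                     scaler_type="minmax",
                     loss_variant="huber_mass_balance_extended"),
    metrics=[
        mass_balance_metric(preprocess, column_dict,
                            scaler_type="minmax",
                            loss_variant="huber_mass_balance_extended"),
        huber_metric(delta=1.0),
    ],
)
```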
```diff
@@ -337,8 +402,9 @@ def huber_metric(delta=1.0):
     return huber


-def mass_balance_evaluation(model, X, y, preprocess):
-    """Calculates the mass balance difference for each cell.
+def mass_balance_evaluation(model, X, preprocess):
+    """Calculates the mass balance difference for each cell
+    between the predicted values and the design dataset.

     Args:
         model: trained model
@@ -353,18 +419,12 @@ def mass_balance_evaluation(model, X, y, preprocess):
     columns = X.iloc[:, X.columns != "Class"].columns
     classes = X["Class"]
     classes.reset_index(drop=True, inplace=True)
-    prediction = pd.DataFrame(model.predict(X[columns]), columns=y.columns)
+    prediction = pd.DataFrame(model.predict(X[columns]), columns=preprocess.scaler_output.feature_names_in_)

     # backtransform min/max or standard scaler
-    if preprocess.scaler_X is not None:
-        X = pd.DataFrame(
-            preprocess.scaler_X.inverse_transform(
-                X.iloc[:, X.columns != "Class"]),
-            columns=columns,
-        )
-        prediction = pd.DataFrame(
-            preprocess.scaler_y.inverse_transform(prediction), columns=columns
-        )
+    if preprocess.scaler_input is not None:
+        X = preprocess.scale_inverse(X)[0]
+        prediction = preprocess.scale_inverse(prediction)[0]

     # apply backtransformation if log transformation was applied
     if preprocess.func_dict_out is not None:
@@ -378,9 +438,12 @@ def mass_balance_evaluation(model, X, y, preprocess):
         (prediction["Sr"] + prediction["Celestite"]) -
         (X["Sr"] + X["Celestite"])
     )
+    dS = np.abs(
+        (prediction["S"] + prediction["Celestite"] + prediction["Barite"]) -
+        (X["S"] + X["Celestite"] + X["Barite"]))

     mass_balance_result = pd.DataFrame(
-        {"dBa": dBa, "dSr": dSr, "mass_balance": dBa + dSr, "Class": classes}
+        {"dBa": dBa, "dSr": dSr, "dS": dS, "mass_balance": dBa + dSr, "mass_balance_extended": dBa+dSr+dS, "Class": classes}
    )

     return mass_balance_result
```
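Since the returned frame now carries `dBa`, `dSr`, `dS` and both aggregate columns per cell, a per-cluster summary is one groupby away. A small hypothetical follow-up, assuming the caller keeps the returned frame as `mass_balance_result`:

```python
# Hypothetical follow-up: summarise the per-cell residuals by cluster label.
summary = mass_balance_result.groupby("Class")[
    ["dBa", "dSr", "dS", "mass_balance", "mass_balance_extended"]
].mean()
print(summary)
```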
```diff
@@ -421,8 +484,8 @@ class preprocessing:
             random_state (int, optional): Seed for reproducability. Defaults to 42.
         """
         self.random_state = random_state
-        self.scaler_X = None
-        self.scaler_y = None
+        self.scaler_input = None
+        self.scaler_output = None
         self.func_dict_in = func_dict_in if func_dict_in is not None else None
         self.func_dict_out = func_dict_out if func_dict_out is not None else None
         self.state = {"cluster": False, "log": False,
@@ -500,8 +563,10 @@ class preprocessing:

         label = np.zeros(len(X))
         label[X[species] > threshold] = 1
-        X["Class"] = label
-        y["Class"] = label
+        X = X.copy()
+        y = y.copy()
+        X.loc[:, "Class"] = label
+        y.loc[:, "Class"] = label

         return X, y

```
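The switch from `X["Class"] = label` to an explicit `copy()` followed by `.loc` assignment is the standard way to avoid pandas' `SettingWithCopyWarning` when `X` may itself be a slice of a larger frame, and it keeps the caller's original frame untouched. A self-contained sketch of the failure mode being avoided:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"Ba": [0.1, 0.9, 0.4], "Sr": [0.2, 0.8, 0.5]})
X = df[df["Ba"] > 0.2]        # a slice; chained assignment on it may warn

X = X.copy()                  # operate on an explicit copy instead
X.loc[:, "Class"] = np.zeros(len(X))
```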
```diff
@@ -584,52 +649,47 @@ class preprocessing:
         self.state["balance"] = True
         return design_resampled, target_resampled

-    def scale_fit(self, X, y, scaling, type="standard"):
+    def scale_fit(self, X, y, type="standard"):
         self.scaler_type = type
-        self.scaler_scope = scaling
         """Fit a scaler for data preprocessing.

         Args:
             X: design dataset
             y: target dataset
-            scaling: learn individual scaler for X and y when "individual" is selected or one global scaler on all data in X and y if "global" is selected (scaler_X and scaler_y are equal)
+            scaling: fit a scaler on all data in X and y. If X and y have different dimensions
+                input and output scaler are trained for the specific columns.
             type (str, optional): Using MinMax Scaling or Standarization. Defaults to "Standard".
         """

         if type == "minmax":
-            self.scaler_X = MinMaxScaler()
-            self.scaler_y = MinMaxScaler()
+            self.scaler_input = MinMaxScaler()
+            self.scaler_output = MinMaxScaler()
         elif type == "standard":
-            self.scaler_X = StandardScaler()
-            self.scaler_y = StandardScaler()
+            self.scaler_input = StandardScaler()
+            self.scaler_output = StandardScaler()

         else:
             raise Exception("No valid scaler type found")

-        if scaling == "individual":
-            self.scaler_X.fit(X.iloc[:, X.columns != "Class"])
-            self.scaler_y.fit(y.iloc[:, y.columns != "Class"])
-
-        elif scaling == "global":
-            self.scaler_X.fit(
-                pd.concat(
-                    [X.iloc[:, X.columns != "Class"],
-                     y.iloc[:, y.columns != "Class"]],
-                    axis=0,
-                )
-            )
-            self.scaler_y = self.scaler_X
+        all_data = pd.concat([X, y],axis=0)
+
+        if len(X.columns) == len(y.columns):
+            self.scaler_input.fit(all_data.loc[:, X.columns != "Class"])
+            self.scaler_output = self.scaler_input
+        else:
+            self.scaler_input.fit(all_data[X.columns[X.columns != "Class"]])
+            self.scaler_output.fit(all_data[y.columns[y.columns != "Class"]])

         self.state["scale"] = True

-        return pd.concat(
-            [X.iloc[:, X.columns != "Class"],
-             y.iloc[:, y.columns != "Class"]],
-            axis=0,
-        )
-
-    def scale_transform(self, X_train, X_test, y_train, y_test):
+    def scale_transform(self, *args):
         """Apply learned scaler on datasets.

         Args:
@@ -641,82 +701,31 @@ class preprocessing:
         Returns:
             transformed dataframes
         """

-        if self.scaler_scope == "global":
-            if len(X_train.columns) > len(y_train.columns):
-                y_train_modified = X_train.copy()
-                y_test_modified = X_test.copy()
-
-                for i in y_train_modified.columns:
-                    if i in y_train.columns:
-                        y_train_modified[i] = y_train[i]
-                        y_test_modified[i] = y_test[i]
-                    else:
-                        y_train_modified[i] = np.nan
-                        y_test_modified[i] = np.nan
-
-                y_train = y_train_modified
-                y_test = y_test_modified
-
-            else:
-                X_train_modified = y_train.copy()
-                X_test_modified = y_test.copy()
-
-                for i in X_train_modified.columns:
-                    if i in X_train.columns:
-                        X_train_modified[i] = X_train[i]
-                        X_test_modified[i] = X_test[i]
-                    else:
-                        X_train_modified[i] = np.nan
-                        X_test_modified[i] = np.nan
-
-                X_train = X_train_modified
-                X_test = X_test_modified
-
-        X_train = pd.concat(
-            [
-                self.scaler_X.transform(
-                    X_train.loc[:, X_train.columns != "Class"]),
-                X_train.loc[:, "Class"],
-            ],
-            axis=1,
-        )
-
-        X_test = pd.concat(
-            [
-                self.scaler_X.transform(
-                    X_test.loc[:, X_test.columns != "Class"]),
-                X_test.loc[:, "Class"],
-            ],
-            axis=1,
-        )
-
-        y_train = pd.concat(
-            [
-                self.scaler_y.transform(
-                    y_train.loc[:, y_train.columns != "Class"]),
-                y_train.loc[:, "Class"],
-            ],
-            axis=1,
-        )
-
-        y_test = pd.concat(
-            [
-                self.scaler_y.transform(
-                    y_test.loc[:, y_test.columns != "Class"]),
-                y_test.loc[:, "Class"],
-            ],
-            axis=1,
-        )
-
-        X_train.dropna(axis=1, inplace=True)
-        X_test.dropna(axis=1, inplace=True)
-        y_train.dropna(axis=1, inplace=True)
-        y_test.dropna(axis=1, inplace=True)
-
-        return X_train, X_test, y_train, y_test
+        results = []
+        for i in args:
+            # check which scaler should be used depending on the columns
+            if len(i.columns[i.columns != "Class"]) == len(self.scaler_input.feature_names_in_):
+                scaler = self.scaler_input
+            else:
+                scaler = self.scaler_output
+
+            if "Class" in i.columns:
+                i = pd.concat(
+                    [
+                        scaler.transform(i.loc[:, i.columns != "Class"]),
+                        i.loc[:, "Class"],
+                    ],
+                    axis=1,
+                )
+            else:
+                i = scaler.transform(i)
+
+            results.append(i)
+
+        return results

     def scale_inverse(self, *args):
         """Backtransform the dataset
@@ -725,65 +734,28 @@ class preprocessing:
             Backtransformed data frames
         """

         result = []

-        if self.scaler_scope == "individual":
-            for i in args:
-                if(len(i.columns) == len(self.scaler_X.feature_names_in_)):
-                    scaler = self.scaler_X
-                else:
-                    scaler = self.scaler_y
-                if "Class" in i.columns:
-                    inversed = pd.DataFrame(
-                        scaler.inverse_transform(
-                            i.loc[:, i.columns != "Class"]),
-                        columns=i.columns[:-1],
-                    )
-                    class_column = i.loc[:, "Class"].reset_index(drop=True)
-                    i = pd.concat([inversed, class_column], axis=1)
-                else:
-                    i = pd.DataFrame(
-                        scaler.inverse_transform(i), columns=i.columns)
-                result.append(i)
-
-        elif self.scaler_scope == "global":
-            for i in args:
-                if (len(i.columns) == len(self.preprocess.scaler_X.feature_names_in_)):
-                    if "Class" in i.columns:
-                        inversed = pd.DataFrame(
-                            self.scaler_X.inverse_transform(
-                                i.loc[:, i.columns != "Class"]),
-                            columns=i.columns[:-1],
-                        )
-                        class_column = i.loc[:, "Class"].reset_index(drop=True)
-                        i = pd.concat([inversed, class_column], axis=1)
-                    else:
-                        i = pd.DataFrame(
-                            self.scaler_X.inverse_transform(i), columns=i.columns)
-                    result.append(i)
-
-                else:
-                    df = pd.DataFrame()
-                    for j in self.scaler_X.feature_names_in_:
-                        if j in i.columns:
-                            df[j] = i[j]
-                        else:
-                            df[j] = np.nan
-                    if "Class" in i.columns:
-                        inversed = pd.DataFrame(
-                            self.scaler_X.inverse_transform(
-                                df.loc[:, df.columns != "Class"]),
-                            columns=df.columns[:-1],
-                        )
-                    else:
-                        i = pd.DataFrame(
-                            self.scaler_X.inverse_transform(df), columns=df.columns)
-                    result.append(i)
+        for i in args:
+            # check which scaler should be used depending on the columns
+            if len(i.columns[i.columns != "Class"]) == len(self.scaler_input.feature_names_in_):
+                scaler = self.scaler_input
+            else:
+                scaler = self.scaler_output
+
+            if "Class" in i.columns:
+                inversed = pd.DataFrame(
+                    scaler.inverse_transform(
+                        i.loc[:, i.columns != "Class"]),
+                    columns=i.columns[:-1],
+                )
+                class_column = i.loc[:, "Class"].reset_index(drop=True)
+                i = pd.concat([inversed, class_column], axis=1)
+            else:
+                i = pd.DataFrame(
+                    scaler.inverse_transform(i), columns=i.columns)
+            result.append(i)

         return result
```
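The net effect of the scaler refactor: one `scale_fit` call now fits `scaler_input` and `scaler_output` on the concatenated design and target data, sharing a single scaler object when the column sets match, and the variadic `scale_transform`/`scale_inverse` pair picks the right scaler per frame by column count, passing any "Class" column through unscaled. A sketch of the intended call sequence, assuming a constructed `preprocessing` instance named `prep` and frames as in the module:

```python
# Sketch of the refactored API; `prep` is an assumed preprocessing instance.
prep.scale_fit(X_train, y_train, type="minmax")

# Any number of frames; each is matched to scaler_input or scaler_output
# by its column count, and a "Class" column passes through unscaled.
X_train_s, X_test_s, y_train_s, y_test_s = prep.scale_transform(
    X_train, X_test, y_train, y_test
)

# Backtransform works the same way and also returns a list.
y_pred_unscaled = prep.scale_inverse(y_pred_scaled)[0]
```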
tests/test_scaler.py (new file, +14 lines)

```diff
@@ -0,0 +1,14 @@
+# import unittest
+# import os
+
+# os.chdir("../src/")
+# print(os.getcwd())
+
+# from preprocessing import *
+
+# class TestScaler(unittest.TestCase):
+#     def sample_test(self):
+#         self.assertEqual(1, 1)
+
+# if __name__ == '__main__':
+#     unittest.main()
```
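The committed test file is entirely commented out. A possible starting point for a real test of the refactored scaler, hedged on two assumptions: that `preprocessing` is importable from the test's path, and that its constructor arguments default to `None`:

```python
import unittest

import numpy as np
import pandas as pd

from preprocessing import preprocessing  # assumed import path


class TestScaler(unittest.TestCase):
    def test_shared_scaler_for_matching_columns(self):
        X = pd.DataFrame({"Ba": [0.0, 1.0, 2.0], "Sr": [1.0, 3.0, 5.0]})
        y = pd.DataFrame({"Ba": [0.5, 1.5, 2.5], "Sr": [2.0, 4.0, 6.0]})
        prep = preprocessing()  # assumes defaults for func_dict_in/out
        prep.scale_fit(X, y, type="minmax")
        # With identical column sets, scale_fit shares one scaler object.
        self.assertIs(prep.scaler_input, prep.scaler_output)

    def test_minmax_round_trip(self):
        X = pd.DataFrame({"Ba": [0.0, 1.0, 2.0], "Sr": [1.0, 3.0, 5.0]})
        prep = preprocessing()
        prep.scale_fit(X, X.copy(), type="minmax")
        scaled = prep.scaler_input.transform(X)
        restored = prep.scaler_input.inverse_transform(scaled)
        np.testing.assert_allclose(restored, X.to_numpy(), rtol=1e-6)


if __name__ == "__main__":
    unittest.main()
```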