From 0f7ee78a8aaef347cbbbd893d71eafc87a2e86cd Mon Sep 17 00:00:00 2001
From: Hannes Signer
Date: Tue, 25 Feb 2025 18:06:56 +0100
Subject: [PATCH] correction of scaling error in mass balance loss

---
 src/preprocessing.py | 90 ++++++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 40 deletions(-)

diff --git a/src/preprocessing.py b/src/preprocessing.py
index 0c0d429..b0b7ef5 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -93,10 +93,10 @@ def custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type="minmax", loss_
 
     # extract the scaling parameters
     if scaler_type == "minmax":
-        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
-        min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
-        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
-        min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)
+        scale_X = tf.convert_to_tensor(preprocess.scaler_X.data_range_, dtype=tf.float32)
+        min_X = tf.convert_to_tensor(preprocess.scaler_X.data_min_, dtype=tf.float32)
+        scale_y = tf.convert_to_tensor(preprocess.scaler_y.data_range_, dtype=tf.float32)
+        min_y = tf.convert_to_tensor(preprocess.scaler_y.data_min_, dtype=tf.float32)
 
     elif scaler_type == "standard":
         scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
@@ -117,11 +117,11 @@ def custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type="minmax", loss_
             results_inverse = results * scale_X + mean_X
 
-        # apply exp1m on the columns of predicted_inverse and results_inverse
-        predicted_inverse = tf.math.expm1(predicted_inverse)
-        results_inverse = tf.math.expm1(results_inverse)
-        print(predicted_inverse)
-
+        # apply exp1m on the columns of predicted_inverse and results_inverse if log transformation was applied
+        if preprocess.func_dict_out is not None:
+            predicted_inverse = tf.math.expm1(predicted_inverse)
+            results_inverse = tf.math.expm1(results_inverse)
+
         # mass balance
         dBa = tf.keras.backend.abs(
             (predicted_inverse[:, column_dict["Ba"]] + predicted_inverse[:, column_dict["Barite"]]) -
@@ -148,10 +148,10 @@ def custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type="minmax", loss_
 
 def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
 
     if scaler_type == "minmax":
-        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
-        min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
-        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
-        min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)
+        scale_X = tf.convert_to_tensor(preprocess.scaler_X.data_range_, dtype=tf.float32)
+        min_X = tf.convert_to_tensor(preprocess.scaler_X.data_min_, dtype=tf.float32)
+        scale_y = tf.convert_to_tensor(preprocess.scaler_y.data_range_, dtype=tf.float32)
+        min_y = tf.convert_to_tensor(preprocess.scaler_y.data_min_, dtype=tf.float32)
 
     elif scaler_type == "standard":
         scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
@@ -169,6 +169,10 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
         elif scaler_type == "standard":
             predicted_inverse = predicted * scale_y + mean_y
             results_inverse = results * scale_X + mean_X
+
+        if preprocess.func_dict_out is not None:
+            predicted_inverse = tf.math.expm1(predicted_inverse)
+            results_inverse = tf.math.expm1(results_inverse)
 
         # mass balance
         dBa = tf.keras.backend.abs(
@@ -201,6 +205,10 @@ def mass_balance_evaluation(model, X, preprocess):
 
     # backtransform min/max or standard scaler
     X = pd.DataFrame(preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]), columns=columns)
     prediction = pd.DataFrame(preprocess.scaler_y.inverse_transform(prediction), columns=columns)
+
+    # apply exp1m on the columns of predicted_inverse and results_inverse if log transformation was applied
+    if preprocess.func_dict_out is not None:
+        X = preprocess.funcInverse(X)
 
     # calculate mass balance
     dBa = np.abs((prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"]))
@@ -216,28 +224,27 @@ class preprocessing:
         self.random_state = random_state
         self.scaler_X = None
         self.scaler_y = None
-        self.func_dict_in = None
         self.func_dict_in = func_dict_in if func_dict_in is not None else None
         self.func_dict_out = func_dict_out if func_dict_out is not None else None
         self.state = {"cluster": False, "log": False, "balance": False, "scale": False}
 
-    def funcTranform(self, X, y):
-        for key in X.keys():
-            if "Class" not in key:
-                X[key] = X[key].apply(self.func_dict_in)
-                y[key] = y[key].apply(self.func_dict_in)
+    def funcTranform(self, *args):
+
+        for i in args:
+            for key in i.keys():
+                if "Class" not in key:
+                    i[key] = i[key].apply(self.func_dict_in)
 
         self.state["log"] = True
+        return args
 
-        return X, y
-
-    def funcInverse(self, X, y):
+    def funcInverse(self, *args):
 
-        for key in X.keys():
-            if "Class" not in key:
-                X[key] = X[key].apply(self.func_dict_out)
-                y[key] = y[key].apply(self.func_dict_out)
+        for i in args:
+            for key in i.keys():
+                if "Class" not in key:
+                    i[key] = i[key].apply(self.func_dict_out)
 
         self.state["log"] = False
-        return X, y
+        return args
 
     def cluster(self, X, y, species='Barite', n_clusters=2, x_length=50, y_length=50):
@@ -339,26 +346,29 @@ class preprocessing:
 
         return X_train, X_test, y_train, y_test
 
-    def scale_inverse(self, X):
-
-        if("Class" in X.columns):
-            print("Class column found")
-            X = pd.concat([pd.DataFrame(self.scaler_X.inverse_transform(X.loc[:, X.columns != "Class"]), columns=X.columns[:-1]), X.loc[:, "Class"]], axis=1)
-        else:
-            X = self.scaler_X.inverse_transform(X)
-
-        return X
+    def scale_inverse(self, *args):
+
+        result = []
+        for i in args:
+            if "Class" in i.columns:
+                inversed = pd.DataFrame(self.scaler_X.inverse_transform(i.loc[:, i.columns != "Class"]), columns=i.columns[:-1])
+                class_column = i.loc[:, "Class"].reset_index(drop=True)
+                i = pd.concat([inversed, class_column], axis=1)
+            else:
+                i = pd.DataFrame(self.scaler_X.inverse_transform(i), columns=i.columns)
+            result.append(i)
+        return result
 
     def split(self, X, y, ratio=0.8):
         X_train, y_train, X_test, y_test = sk.train_test_split(X, y, test_size = ratio, random_state=self.random_state)
 
         return X_train, y_train, X_test, y_test
 
-    def class_selection(self, X, y, class_label):
-        X = X[X['Class'] == class_label]
-        y = y[y['Class'] == class_label]
+    def class_selection(self, *args, class_label=0):
 
-        return X, y
+        for i in args:
+            i = i[i['Class'] == class_label]
+
+        return args
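Note (not part of the patch): a minimal standalone sketch of the scaling identity the corrected loss appears to rely on, assuming preprocess.scaler_X / preprocess.scaler_y are plain sklearn.preprocessing.MinMaxScaler instances with the default feature range (0, 1), as the data_range_ / data_min_ attributes suggest. MinMaxScaler's scale_ and min_ parameterize the forward transform (X_scaled = X * scale_ + min_), so the old code was effectively re-applying the scaling instead of undoing it; the inverse is X = X_scaled * data_range_ + data_min_. The toy data below is hypothetical.

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Toy data standing in for the training features (hypothetical values).
X = np.array([[1.0, 10.0], [2.0, 20.0], [4.0, 40.0]])
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Correct manual inverse for the default feature range (0, 1):
# X = X_scaled * data_range_ + data_min_
X_back = X_scaled * scaler.data_range_ + scaler.data_min_
assert np.allclose(X_back, scaler.inverse_transform(X_scaled))
assert np.allclose(X_back, X)

# Old (incorrect) version: scale_ / min_ belong to the forward transform,
# so this does not recover the original values.
X_wrong = X_scaled * scaler.scale_ + scaler.min_
assert not np.allclose(X_wrong, X)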