From 0f7ee78a8aaef347cbbbd893d71eafc87a2e86cd Mon Sep 17 00:00:00 2001
From: Hannes Signer
Date: Tue, 25 Feb 2025 18:06:56 +0100
Subject: [PATCH] correction of scaling error in mass balance loss

---
 src/preprocessing.py | 90 ++++++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 40 deletions(-)

diff --git a/src/preprocessing.py b/src/preprocessing.py
index 0c0d429..b0b7ef5 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -93,10 +93,10 @@ def custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type="minmax", loss_
 
     # extract the scaling parameters
     if scaler_type == "minmax":
-        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
-        min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
-        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
-        min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)
+        scale_X = tf.convert_to_tensor(preprocess.scaler_X.data_range_, dtype=tf.float32)
+        min_X = tf.convert_to_tensor(preprocess.scaler_X.data_min_, dtype=tf.float32)
+        scale_y = tf.convert_to_tensor(preprocess.scaler_y.data_range_, dtype=tf.float32)
+        min_y = tf.convert_to_tensor(preprocess.scaler_y.data_min_, dtype=tf.float32)
 
     elif scaler_type == "standard":
         scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
@@ -117,11 +117,11 @@ def custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type="minmax", loss_
             results_inverse = results * scale_X + mean_X
 
-        # apply exp1m on the columns of predicted_inverse and results_inverse
-        predicted_inverse = tf.math.expm1(predicted_inverse)
-        results_inverse = tf.math.expm1(results_inverse)
-        print(predicted_inverse)
-
+        # apply exp1m on the columns of predicted_inverse and results_inverse if log transformation was applied
+        if preprocess.func_dict_out is not None:
+            predicted_inverse = tf.math.expm1(predicted_inverse)
+            results_inverse = tf.math.expm1(results_inverse)
+
         # mass balance
         dBa = tf.keras.backend.abs(
             (predicted_inverse[:, column_dict["Ba"]] + predicted_inverse[:, column_dict["Barite"]]) -
@@ -148,10 +148,10 @@ def custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type="minmax", loss_
 
 def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
 
     if scaler_type == "minmax":
-        scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
-        min_X = tf.convert_to_tensor(preprocess.scaler_X.min_, dtype=tf.float32)
-        scale_y = tf.convert_to_tensor(preprocess.scaler_y.scale_, dtype=tf.float32)
-        min_y = tf.convert_to_tensor(preprocess.scaler_y.min_, dtype=tf.float32)
+        scale_X = tf.convert_to_tensor(preprocess.scaler_X.data_range_, dtype=tf.float32)
+        min_X = tf.convert_to_tensor(preprocess.scaler_X.data_min_, dtype=tf.float32)
+        scale_y = tf.convert_to_tensor(preprocess.scaler_y.data_range_, dtype=tf.float32)
+        min_y = tf.convert_to_tensor(preprocess.scaler_y.data_min_, dtype=tf.float32)
 
     elif scaler_type == "standard":
         scale_X = tf.convert_to_tensor(preprocess.scaler_X.scale_, dtype=tf.float32)
@@ -169,6 +169,10 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
         elif scaler_type == "standard":
             predicted_inverse = predicted * scale_y + mean_y
             results_inverse = results * scale_X + mean_X
+
+        if preprocess.func_dict_out is not None:
+            predicted_inverse = tf.math.expm1(predicted_inverse)
+            results_inverse = tf.math.expm1(results_inverse)
 
         # mass balance
         dBa = tf.keras.backend.abs(
@@ -201,6 +205,10 @@ def mass_balance_evaluation(model, X, preprocess):
 
     # backtransform min/max or standard scaler
     X = pd.DataFrame(preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]), columns=columns)
     prediction = pd.DataFrame(preprocess.scaler_y.inverse_transform(prediction), columns=columns)
+
+    # apply exp1m on the columns of predicted_inverse and results_inverse if log transformation was applied
+    if preprocess.func_dict_out is not None:
+        X = preprocess.funcInverse(X)
 
     # calculate mass balance
     dBa = np.abs((prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"]))
@@ -216,28 +224,27 @@ class preprocessing:
         self.random_state = random_state
         self.scaler_X = None
         self.scaler_y = None
-        self.func_dict_in = None
         self.func_dict_in = func_dict_in if func_dict_in is not None else None
         self.func_dict_out = func_dict_out if func_dict_out is not None else None
         self.state = {"cluster": False, "log": False, "balance": False, "scale": False}
 
-    def funcTranform(self, X, y):
-        for key in X.keys():
-            if "Class" not in key:
-                X[key] = X[key].apply(self.func_dict_in)
-                y[key] = y[key].apply(self.func_dict_in)
+    def funcTranform(self, *args):
+
+        for i in args:
+            for key in i.keys():
+                if "Class" not in key:
+                    i[key] = i[key].apply(self.func_dict_in)
 
         self.state["log"] = True
+        return args
 
-        return X, y
-
-    def funcInverse(self, X, y):
+    def funcInverse(self, *args):
 
-        for key in X.keys():
-            if "Class" not in key:
-                X[key] = X[key].apply(self.func_dict_out)
-                y[key] = y[key].apply(self.func_dict_out)
+        for i in args:
+            for key in i.keys():
+                if "Class" not in key:
+                    i[key] = i[key].apply(self.func_dict_out)
 
         self.state["log"] = False
-        return X, y
+        return args
 
     def cluster(self, X, y, species='Barite', n_clusters=2, x_length=50, y_length=50):
@@ -339,26 +346,29 @@ class preprocessing:
 
         return X_train, X_test, y_train, y_test
 
-    def scale_inverse(self, X):
-
-        if("Class" in X.columns):
-            print("Class column found")
-            X = pd.concat([pd.DataFrame(self.scaler_X.inverse_transform(X.loc[:, X.columns != "Class"]), columns=X.columns[:-1]), X.loc[:, "Class"]], axis=1)
-        else:
-            X = self.scaler_X.inverse_transform(X)
-
-        return X
+    def scale_inverse(self, *args):
+
+        result = []
+        for i in args:
+            if "Class" in i.columns:
+                inversed = pd.DataFrame(self.scaler_X.inverse_transform(i.loc[:, i.columns != "Class"]), columns=i.columns[:-1])
+                class_column = i.loc[:, "Class"].reset_index(drop=True)
+                i = pd.concat([inversed, class_column], axis=1)
+            else:
+                i = pd.DataFrame(self.scaler_X.inverse_transform(i), columns=i.columns)
+            result.append(i)
+        return result
 
     def split(self, X, y, ratio=0.8):
         X_train, y_train, X_test, y_test = sk.train_test_split(X, y, test_size = ratio, random_state=self.random_state)
 
         return X_train, y_train, X_test, y_test
 
-    def class_selection(self, X, y, class_label):
-        X = X[X['Class'] == class_label]
-        y = y[y['Class'] == class_label]
+    def class_selection(self, *args, class_label=0):
 
-        return X, y
+        for i in args:
+            i = i[i['Class'] == class_label]
+
+        return args
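Note (not part of the patch): a minimal standalone sketch of the scaling identity the corrected loss appears to rely on, assuming preprocess.scaler_X / preprocess.scaler_y are plain sklearn.preprocessing.MinMaxScaler instances with the default feature range (0, 1), as the data_range_ / data_min_ attributes suggest. MinMaxScaler's scale_ and min_ parameterize the forward transform (X_scaled = X * scale_ + min_), so the old code was effectively re-applying the scaling instead of undoing it; the inverse is X = X_scaled * data_range_ + data_min_. The toy data below is hypothetical.

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Toy data standing in for the training features (hypothetical values).
X = np.array([[1.0, 10.0], [2.0, 20.0], [4.0, 40.0]])
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Correct manual inverse for the default feature range (0, 1):
# X = X_scaled * data_range_ + data_min_
X_back = X_scaled * scaler.data_range_ + scaler.data_min_
assert np.allclose(X_back, scaler.inverse_transform(X_scaled))
assert np.allclose(X_back, X)

# Old (incorrect) version: scale_ / min_ belong to the forward transform,
# so this does not recover the original values.
X_wrong = X_scaled * scaler.scale_ + scaler.min_
assert not np.allclose(X_wrong, X)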