Mirror of https://git.gfz-potsdam.de/naaice/model-training.git
Synced 2025-12-15 19:58:22 +01:00
Compare commits: 4 commits, f2c89e0b83 ... 09a5687580
| Author | SHA1 | Date |
|---|---|---|
| | 09a5687580 | |
| | bbccd1444d | |
| | e21c7bede8 | |
| | 5b520c368d | |
BIN results/adam_history.pkl (Stored with Git LFS): binary file not shown.
BIN results/rmsprop_history.pkl (Stored with Git LFS): binary file not shown.
BIN results/sgd_history.pkl (Stored with Git LFS): binary file not shown.
File diff suppressed because one or more lines are too long
The hunks below modify the Python module that defines `custom_loss`, the mass balance metrics, and the `preprocessing` class.

```diff
@@ -160,38 +160,29 @@ def custom_loss(
             preprocess.scaler_type, scaler_type))

     if scaler_type == "minmax":
-        scale_X = tf.convert_to_tensor(
-            preprocess.scaler_X.data_range_, dtype=tf.float32
+        data_range = tf.convert_to_tensor(
+            preprocess.scaler_output.data_range_, dtype=tf.float32
         )
-        min_X = tf.convert_to_tensor(
-            preprocess.scaler_X.data_min_, dtype=tf.float32
-        )
-        scale_y = tf.convert_to_tensor(
-            preprocess.scaler_y.data_range_, dtype=tf.float32
-        )
-        min_y = tf.convert_to_tensor(
-            preprocess.scaler_y.data_min_, dtype=tf.float32
+        min_values = tf.convert_to_tensor(
+            preprocess.scaler_output.data_min_, dtype=tf.float32
         )

     elif scaler_type == "standard":
-        scale_X = tf.convert_to_tensor(
-            preprocess.scaler_X.scale_, dtype=tf.float32)
-        mean_X = tf.convert_to_tensor(
-            preprocess.scaler_X.mean_, dtype=tf.float32)
-        scale_y = tf.convert_to_tensor(
-            preprocess.scaler_y.scale_, dtype=tf.float32)
-        mean_y = tf.convert_to_tensor(
-            preprocess.scaler_y.mean_, dtype=tf.float32)
+        scale_output = tf.convert_to_tensor(
+            preprocess.scaler_output.scale_, dtype=tf.float32)
+        mean_output = tf.convert_to_tensor(
+            preprocess.scaler_output.mean_, dtype=tf.float32)

     def loss(results, predicted):
         # inverse min/max scaling
         if scaler_type == "minmax":
-            predicted_inverse = predicted * scale_y + min_y
-            results_inverse = results * scale_X + min_X
+            predicted_inverse = predicted * data_range + min_values
+            results_inverse = results * data_range + min_values

         # inverse standard scaling
         elif scaler_type == "standard":
-            predicted_inverse = predicted * scale_y + mean_y
-            results_inverse = results * scale_X + mean_X
+            predicted_inverse = predicted * scale_output + mean_output
+            results_inverse = results * scale_output + mean_output

         elif scaler_type == "none":
             predicted_inverse = predicted
```
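The pattern behind this hunk: the scikit-learn scaler parameters are materialized as TensorFlow constants once, when the loss factory runs, so the nested loss function can undo the scaling inside the training graph without calling back into sklearn per batch. A minimal sketch of the min/max case, with a hypothetical fitted scaler standing in for `preprocess.scaler_output`:

```python
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

# Hypothetical fitted scaler standing in for preprocess.scaler_output.
scaler_output = MinMaxScaler().fit(np.random.rand(100, 3))

# Captured once, outside the loss, so no sklearn call happens per batch.
data_range = tf.convert_to_tensor(scaler_output.data_range_, dtype=tf.float32)
min_values = tf.convert_to_tensor(scaler_output.data_min_, dtype=tf.float32)

def inverse_minmax(scaled):
    # MinMaxScaler maps x to (x - data_min_) / data_range_,
    # so the inverse is simply scaled * data_range_ + data_min_.
    return scaled * data_range + min_values
```

A single pair of constants now serves both `results` and `predicted` because input and output share one scaler when the column sets match, as shown in the `scale_fit` hunk further down.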
```diff
@@ -204,6 +195,8 @@ def custom_loss(

         # mass balance
         # in total no Barium and Strontium should be lost in one simulation step

+        # TODO: encapsulate the mass balance terms in a function
         dBa = tf.keras.backend.abs(
             (
                 predicted_inverse[:, column_dict["Ba"]]
@@ -224,6 +217,19 @@ def custom_loss(
                 + results_inverse[:, column_dict["Celestite"]]
             )
         )

+        dS = tf.keras.backend.abs(
+            (
+                predicted_inverse[:, column_dict["S"]]
+                + predicted_inverse[:, column_dict["Celestite"]]
+                + predicted_inverse[:, column_dict["Barite"]]
+            )
+            - (
+                results_inverse[:, column_dict["S"]]
+                + results_inverse[:, column_dict["Celestite"]]
+                + results_inverse[:, column_dict["Barite"]]
+            )
+        )
+
         # huber loss
         huber_loss = tf.keras.losses.Huber(delta)(results, predicted)
```
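The new `dS` term mirrors `dBa` and `dSr`: barite (BaSO4) and celestite (SrSO4) each bind one mole of sulfur per formula unit, so total sulfur per cell is the aqueous S plus the two mineral amounts, and the residual is the absolute change over one simulation step. A plain NumPy sketch of the same bookkeeping, with a hypothetical column layout standing in for `column_dict`:

```python
import numpy as np

# Hypothetical column layout standing in for column_dict.
cols = {"Ba": 0, "Sr": 1, "S": 2, "Barite": 3, "Celestite": 4}

def sulfur_residual(results, predicted):
    # Total sulfur = aqueous S + S bound in the two sulfate minerals.
    s_before = (results[:, cols["S"]]
                + results[:, cols["Celestite"]]
                + results[:, cols["Barite"]])
    s_after = (predicted[:, cols["S"]]
               + predicted[:, cols["Celestite"]]
               + predicted[:, cols["Barite"]])
    return np.abs(s_after - s_before)
```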
```diff
@@ -233,6 +239,8 @@ def custom_loss(
             total_loss = huber_loss
         elif loss_variant == "huber_mass_balance":
             total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr
+        elif "huber_mass_balance_extended":
+            total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr + h3 * dS
         else:
             raise Exception(
                 "No valid loss variant found. Choose between 'huber' and 'huber_mass_balance'."
```
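One caveat in the added branch: `elif "huber_mass_balance_extended":` tests a non-empty string literal rather than comparing `loss_variant`, so it is always true and the `else` clause, along with its exception, becomes unreachable for any unmatched variant. A sketch of the presumably intended dispatch (reusing `h3` for `dS` follows the committed line; a separate weight may have been intended):

```python
def combine_loss(loss_variant, huber_loss, dBa, dSr, dS, h1, h2, h3):
    if loss_variant == "huber":
        return huber_loss
    elif loss_variant == "huber_mass_balance":
        return h1 * huber_loss + h2 * dBa + h3 * dSr
    # Presumably intended comparison; the committed code omits
    # `loss_variant ==` and therefore always takes this branch.
    elif loss_variant == "huber_mass_balance_extended":
        return h1 * huber_loss + h2 * dBa + h3 * dSr + h3 * dS
    raise Exception(
        "No valid loss variant found. Choose between 'huber', "
        "'huber_mass_balance' and 'huber_mass_balance_extended'."
    )
```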
```diff
@@ -243,7 +251,7 @@ def custom_loss(
     return loss


-def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
+def mass_balance_metric(preprocess, column_dict, scaler_type="minmax", loss_variant="huber_mass_balance"):
     """Auxilary function to calculate the mass balance during training.

     Args:
@@ -256,36 +264,29 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
     """

     if scaler_type == "minmax":
-        scale_X = tf.convert_to_tensor(
-            preprocess.scaler_X.data_range_, dtype=tf.float32
+        data_range = tf.convert_to_tensor(
+            preprocess.scaler_output.data_range_, dtype=tf.float32
         )
-        min_X = tf.convert_to_tensor(
-            preprocess.scaler_X.data_min_, dtype=tf.float32)
-        scale_y = tf.convert_to_tensor(
-            preprocess.scaler_y.data_range_, dtype=tf.float32
+        min_values = tf.convert_to_tensor(
+            preprocess.scaler_output.data_min_, dtype=tf.float32
         )
-        min_y = tf.convert_to_tensor(
-            preprocess.scaler_y.data_min_, dtype=tf.float32)

     elif scaler_type == "standard":
-        scale_X = tf.convert_to_tensor(
-            preprocess.scaler_X.scale_, dtype=tf.float32)
-        mean_X = tf.convert_to_tensor(
-            preprocess.scaler_X.mean_, dtype=tf.float32)
-        scale_y = tf.convert_to_tensor(
-            preprocess.scaler_y.scale_, dtype=tf.float32)
-        mean_y = tf.convert_to_tensor(
-            preprocess.scaler_y.mean_, dtype=tf.float32)
+        scale_output = tf.convert_to_tensor(
+            preprocess.scaler_output.scale_, dtype=tf.float32)
+        mean_output = tf.convert_to_tensor(
+            preprocess.scaler_output.mean_, dtype=tf.float32)

     def mass_balance(results, predicted):
         # inverse min/max scaling
         if scaler_type == "minmax":
-            predicted_inverse = predicted * scale_y + min_y
-            results_inverse = results * scale_X + min_X
+            predicted_inverse = predicted * data_range + min_values
+            results_inverse = results * data_range + min_values

+        # inverse standard scaling
         elif scaler_type == "standard":
-            predicted_inverse = predicted * scale_y + mean_y
-            results_inverse = results * scale_X + mean_X
+            predicted_inverse = predicted * scale_output + mean_output
+            results_inverse = results * scale_output + mean_output

         elif scaler_type == "none":
             predicted_inverse = predicted
@@ -306,6 +307,7 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
                 + results_inverse[:, column_dict["Barite"]]
             )
         )

         dSr = tf.keras.backend.abs(
             (
                 predicted_inverse[:, column_dict["Sr"]]
```
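As with `custom_loss` above, `mass_balance_metric` is a factory: Keras invokes a metric with only `(y_true, y_pred)`, so scaler parameters and the loss variant have to be captured in a closure. A generic, self-contained illustration of the pattern (the metric name and scaling are hypothetical, not from this repository):

```python
import tensorflow as tf

def scaled_mae_metric(scale):
    """Illustrative factory: extra state is frozen into the closure."""
    scale = tf.convert_to_tensor(scale, dtype=tf.float32)

    def scaled_mae(y_true, y_pred):
        # Keras passes only the two tensors; `scale` comes from the closure.
        return tf.reduce_mean(tf.abs((y_true - y_pred) * scale))

    return scaled_mae
```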
```diff
@@ -316,11 +318,74 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
                 + results_inverse[:, column_dict["Celestite"]]
             )
         )
-        return tf.reduce_mean(dBa + dSr)
+
+        dS = tf.keras.backend.abs(
+            (
+                predicted_inverse[:, column_dict["S"]]
+                + predicted_inverse[:, column_dict["Celestite"]]
+                + predicted_inverse[:, column_dict["Barite"]]
+            )
+            - (
+                results_inverse[:, column_dict["S"]]
+                + results_inverse[:, column_dict["Celestite"]]
+                + results_inverse[:, column_dict["Barite"]]
+            )
+        )
+
+        if loss_variant == "huber_mass_balance":
+            return tf.reduce_mean(dBa + dSr)
+        elif loss_variant == "huber_mass_balance_extended":
+            return tf.reduce_mean(dBa + dSr + dS)

     return mass_balance


+# def mass_balance_barium(predicted_inverse, results_inverse, column_dict):
+#     dBa = tf.keras.backend.abs(
+#         (
+#             predicted_inverse[:, column_dict["Ba"]]
+#             + predicted_inverse[:, column_dict["Barite"]]
+#         )
+#         - (
+#             results_inverse[:, column_dict["Ba"]]
+#             + results_inverse[:, column_dict["Barite"]]
+#         )
+#     )
+
+#     return dBa
+
+
+# def mass_balance_strontium(predicted_inverse, results_inverse, column_dict):
+#     dSr = tf.keras.backend.abs(
+#         (
+#             predicted_inverse[:, column_dict["Sr"]]
+#             + predicted_inverse[:, column_dict["Celestite"]]
+#         )
+#         - (
+#             results_inverse[:, column_dict["Sr"]]
+#             + results_inverse[:, column_dict["Celestite"]]
+#         )
+#     )
+
+#     return dSr
+
+
+# def mass_balance_sulfur(predicted_inverse, results_inverse, column_dict):
+#     dS = tf.keras.backend.abs(
+#         (
+#             predicted_inverse[:, column_dict["S"]]
+#             + predicted_inverse[:, column_dict["Celestite"]]
+#             + predicted_inverse[:, column_dict["Barite"]]
+#         )
+#         - (
+#             results_inverse[:, column_dict["S"]]
+#             + results_inverse[:, column_dict["Celestite"]]
+#             + results_inverse[:, column_dict["Barite"]]
+#         )
+#     )
+
+#     return dS
+
+
 def huber_metric(delta=1.0):
     """Auxilary function to calculate the Huber loss during training.
```
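Taken together, the factories above plug into Keras as the loss and named metrics. A hypothetical wiring, assuming a fitted `preprocess` object and a `column_dict` as in the hunks (the full signature of `custom_loss` is not shown in this diff, so the keyword arguments here are assumptions):

```python
# Hypothetical usage; `model`, `preprocess` and `column_dict` are assumed
# to exist as in the surrounding module.
model.compile(
    optimizer="adam",
    loss=custom_loss(preprocess, column_dict,
                     scaler_type="minmax",
                     loss_variant="huber_mass_balance_extended"),
    metrics=[
        mass_balance_metric(preprocess, column_dict,
                            scaler_type="minmax",
                            loss_variant="huber_mass_balance_extended"),
        huber_metric(delta=1.0),
    ],
)
```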
```diff
@@ -337,8 +402,9 @@ def huber_metric(delta=1.0):
     return huber


-def mass_balance_evaluation(model, X, y, preprocess):
-    """Calculates the mass balance difference for each cell.
+def mass_balance_evaluation(model, X, preprocess):
+    """Calculates the mass balance difference for each cell
+    between the predicted values and the design dataset.

     Args:
         model: trained model
@@ -353,18 +419,12 @@ def mass_balance_evaluation(model, X, y, preprocess):
     columns = X.iloc[:, X.columns != "Class"].columns
     classes = X["Class"]
     classes.reset_index(drop=True, inplace=True)
-    prediction = pd.DataFrame(model.predict(X[columns]), columns=y.columns)
+    prediction = pd.DataFrame(model.predict(X[columns]), columns=preprocess.scaler_output.feature_names_in_)

     # backtransform min/max or standard scaler
-    if preprocess.scaler_X is not None:
-        X = pd.DataFrame(
-            preprocess.scaler_X.inverse_transform(
-                X.iloc[:, X.columns != "Class"]),
-            columns=columns,
-        )
-        prediction = pd.DataFrame(
-            preprocess.scaler_y.inverse_transform(prediction), columns=columns
-        )
+    if preprocess.scaler_input is not None:
+        X = preprocess.scale_inverse(X)[0]
+        prediction = preprocess.scale_inverse(prediction)[0]

     # apply backtransformation if log transformation was applied
     if preprocess.func_dict_out is not None:
@@ -378,9 +438,12 @@ def mass_balance_evaluation(model, X, y, preprocess):
         (prediction["Sr"] + prediction["Celestite"]) -
         (X["Sr"] + X["Celestite"])
     )
+    dS = np.abs(
+        (prediction["S"] + prediction["Celestite"] + prediction["Barite"]) -
+        (X["S"] + X["Celestite"] + X["Barite"]))

     mass_balance_result = pd.DataFrame(
-        {"dBa": dBa, "dSr": dSr, "mass_balance": dBa + dSr, "Class": classes}
+        {"dBa": dBa, "dSr": dSr, "dS": dS, "mass_balance": dBa + dSr, "mass_balance_extended": dBa+dSr+dS, "Class": classes}
    )

     return mass_balance_result
```
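Since the returned frame now carries `dBa`, `dSr`, `dS` and both aggregate columns per cell, a per-cluster summary is one groupby away. A small hypothetical follow-up, assuming the caller keeps the returned frame as `mass_balance_result`:

```python
# Hypothetical follow-up: summarise the per-cell residuals by cluster label.
summary = mass_balance_result.groupby("Class")[
    ["dBa", "dSr", "dS", "mass_balance", "mass_balance_extended"]
].mean()
print(summary)
```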
```diff
@@ -421,8 +484,8 @@ class preprocessing:
             random_state (int, optional): Seed for reproducability. Defaults to 42.
         """
         self.random_state = random_state
-        self.scaler_X = None
-        self.scaler_y = None
+        self.scaler_input = None
+        self.scaler_output = None
         self.func_dict_in = func_dict_in if func_dict_in is not None else None
         self.func_dict_out = func_dict_out if func_dict_out is not None else None
         self.state = {"cluster": False, "log": False,
@@ -500,8 +563,10 @@ class preprocessing:

         label = np.zeros(len(X))
         label[X[species] > threshold] = 1
-        X["Class"] = label
-        y["Class"] = label
+        X = X.copy()
+        y = y.copy()
+        X.loc[:, "Class"] = label
+        y.loc[:, "Class"] = label

         return X, y

```
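The switch from `X["Class"] = label` to an explicit `copy()` followed by `.loc` assignment is the standard way to avoid pandas' `SettingWithCopyWarning` when `X` may itself be a slice of a larger frame, and it keeps the caller's original frame untouched. A self-contained sketch of the failure mode being avoided:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"Ba": [0.1, 0.9, 0.4], "Sr": [0.2, 0.8, 0.5]})
X = df[df["Ba"] > 0.2]        # a slice; chained assignment on it may warn

X = X.copy()                  # operate on an explicit copy instead
X.loc[:, "Class"] = np.zeros(len(X))
```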
```diff
@@ -584,52 +649,47 @@ class preprocessing:
         self.state["balance"] = True
         return design_resampled, target_resampled

-    def scale_fit(self, X, y, scaling, type="standard"):
+    def scale_fit(self, X, y, type="standard"):
         self.scaler_type = type
-        self.scaler_scope = scaling
         """Fit a scaler for data preprocessing.

         Args:
             X: design dataset
             y: target dataset
-            scaling: learn individual scaler for X and y when "individual" is selected or one global scaler on all data in X and y if "global" is selected (scaler_X and scaler_y are equal)
+            scaling: fit a scaler on all data in X and y. If X and y have different dimensions
+                input and output scaler are trained for the specific columns.
             type (str, optional): Using MinMax Scaling or Standarization. Defaults to "Standard".
         """

         if type == "minmax":
-            self.scaler_X = MinMaxScaler()
-            self.scaler_y = MinMaxScaler()
+            self.scaler_input = MinMaxScaler()
+            self.scaler_output = MinMaxScaler()
         elif type == "standard":
-            self.scaler_X = StandardScaler()
-            self.scaler_y = StandardScaler()
+            self.scaler_input = StandardScaler()
+            self.scaler_output = StandardScaler()

         else:
             raise Exception("No valid scaler type found")

-        if scaling == "individual":
-            self.scaler_X.fit(X.iloc[:, X.columns != "Class"])
-            self.scaler_y.fit(y.iloc[:, y.columns != "Class"])
-
-        elif scaling == "global":
-            self.scaler_X.fit(
-                pd.concat(
-                    [X.iloc[:, X.columns != "Class"],
-                     y.iloc[:, y.columns != "Class"]],
-                    axis=0,
-                )
-            )
-            self.scaler_y = self.scaler_X
+        all_data = pd.concat([X, y],axis=0)
+
+        if len(X.columns) == len(y.columns):
+            self.scaler_input.fit(all_data.loc[:, X.columns != "Class"])
+            self.scaler_output = self.scaler_input
+        else:
+            self.scaler_input.fit(all_data[X.columns[X.columns != "Class"]])
+            self.scaler_output.fit(all_data[y.columns[y.columns != "Class"]])

         self.state["scale"] = True

-        return pd.concat(
-            [X.iloc[:, X.columns != "Class"],
-             y.iloc[:, y.columns != "Class"]],
-            axis=0,
-        )
-
-    def scale_transform(self, X_train, X_test, y_train, y_test):
+    def scale_transform(self, *args):
         """Apply learned scaler on datasets.

         Args:
@@ -641,82 +701,31 @@ class preprocessing:
         Returns:
             transformed dataframes
         """

-        if self.scaler_scope == "global":
-            if len(X_train.columns) > len(y_train.columns):
-                y_train_modified = X_train.copy()
-                y_test_modified = X_test.copy()
-
-                for i in y_train_modified.columns:
-                    if i in y_train.columns:
-                        y_train_modified[i] = y_train[i]
-                        y_test_modified[i] = y_test[i]
-                    else:
-                        y_train_modified[i] = np.nan
-                        y_test_modified[i] = np.nan
-
-                y_train = y_train_modified
-                y_test = y_test_modified
-
-            else:
-                X_train_modified = y_train.copy()
-                X_test_modified = y_test.copy()
-
-                for i in X_train_modified.columns:
-                    if i in X_train.columns:
-                        X_train_modified[i] = X_train[i]
-                        X_test_modified[i] = X_test[i]
-                    else:
-                        X_train_modified[i] = np.nan
-                        X_test_modified[i] = np.nan
-
-                X_train = X_train_modified
-                X_test = X_test_modified
-
-        X_train = pd.concat(
-            [
-                self.scaler_X.transform(
-                    X_train.loc[:, X_train.columns != "Class"]),
-                X_train.loc[:, "Class"],
-            ],
-            axis=1,
-        )
-
-        X_test = pd.concat(
-            [
-                self.scaler_X.transform(
-                    X_test.loc[:, X_test.columns != "Class"]),
-                X_test.loc[:, "Class"],
-            ],
-            axis=1,
-        )
-
-        y_train = pd.concat(
-            [
-                self.scaler_y.transform(
-                    y_train.loc[:, y_train.columns != "Class"]),
-                y_train.loc[:, "Class"],
-            ],
-            axis=1,
-        )
-
-        y_test = pd.concat(
-            [
-                self.scaler_y.transform(
-                    y_test.loc[:, y_test.columns != "Class"]),
-                y_test.loc[:, "Class"],
-            ],
-            axis=1,
-        )
-
-        X_train.dropna(axis=1, inplace=True)
-        X_test.dropna(axis=1, inplace=True)
-        y_train.dropna(axis=1, inplace=True)
-        y_test.dropna(axis=1, inplace=True)
-
-        return X_train, X_test, y_train, y_test
+        results = []
+        for i in args:
+            # check which scaler should be used depending on the columns
+            if len(i.columns[i.columns != "Class"]) == len(self.scaler_input.feature_names_in_):
+                scaler = self.scaler_input
+            else:
+                scaler = self.scaler_output
+
+            if "Class" in i.columns:
+                i = pd.concat(
+                    [
+                        scaler.transform(i.loc[:, i.columns != "Class"]),
+                        i.loc[:, "Class"],
+                    ],
+                    axis=1,
+                )
+            else:
+                i = scaler.transform(i)
+
+            results.append(i)
+
+        return results

     def scale_inverse(self, *args):
         """Backtransform the dataset
@@ -725,65 +734,28 @@ class preprocessing:
             Backtransformed data frames
         """

         result = []

-        if self.scaler_scope == "individual":
-            for i in args:
-                if(len(i.columns) == len(self.scaler_X.feature_names_in_)):
-                    scaler = self.scaler_X
-                else:
-                    scaler = self.scaler_y
-                if "Class" in i.columns:
-                    inversed = pd.DataFrame(
-                        scaler.inverse_transform(
-                            i.loc[:, i.columns != "Class"]),
-                        columns=i.columns[:-1],
-                    )
-                    class_column = i.loc[:, "Class"].reset_index(drop=True)
-                    i = pd.concat([inversed, class_column], axis=1)
-                else:
-                    i = pd.DataFrame(
-                        scaler.inverse_transform(i), columns=i.columns)
-                result.append(i)
-
-        elif self.scaler_scope == "global":
-            for i in args:
-                if (len(i.columns) == len(self.preprocess.scaler_X.feature_names_in_)):
-                    if "Class" in i.columns:
-                        inversed = pd.DataFrame(
-                            self.scaler_X.inverse_transform(
-                                i.loc[:, i.columns != "Class"]),
-                            columns=i.columns[:-1],
-                        )
-                        class_column = i.loc[:, "Class"].reset_index(drop=True)
-                        i = pd.concat([inversed, class_column], axis=1)
-                    else:
-                        i = pd.DataFrame(
-                            self.scaler_X.inverse_transform(i), columns=i.columns)
-                    result.append(i)
-
-                else:
-                    df = pd.DataFrame()
-                    for j in self.scaler_X.feature_names_in_:
-                        if j in i.columns:
-                            df[j] = i[j]
-                        else:
-                            df[j] = np.nan
-                    if "Class" in i.columns:
-                        inversed = pd.DataFrame(
-                            self.scaler_X.inverse_transform(
-                                df.loc[:, df.columns != "Class"]),
-                            columns=df.columns[:-1],
-                        )
-                    else:
-                        i = pd.DataFrame(
-                            self.scaler_X.inverse_transform(df), columns=df.columns)
-                    result.append(i)
+        for i in args:
+            # check which scaler should be used depending on the columns
+            if len(i.columns[i.columns != "Class"]) == len(self.scaler_input.feature_names_in_):
+                scaler = self.scaler_input
+            else:
+                scaler = self.scaler_output
+
+            if "Class" in i.columns:
+                inversed = pd.DataFrame(
+                    scaler.inverse_transform(
+                        i.loc[:, i.columns != "Class"]),
+                    columns=i.columns[:-1],
+                )
+                class_column = i.loc[:, "Class"].reset_index(drop=True)
+                i = pd.concat([inversed, class_column], axis=1)
+            else:
+                i = pd.DataFrame(
+                    scaler.inverse_transform(i), columns=i.columns)
+            result.append(i)

         return result
```
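The net effect of the scaler refactor: one `scale_fit` call now fits `scaler_input` and `scaler_output` on the concatenated design and target data, sharing a single scaler object when the column sets match, and the variadic `scale_transform`/`scale_inverse` pair picks the right scaler per frame by column count, passing any "Class" column through unscaled. A sketch of the intended call sequence, assuming a constructed `preprocessing` instance named `prep` and frames as in the module:

```python
# Sketch of the refactored API; `prep` is an assumed preprocessing instance.
prep.scale_fit(X_train, y_train, type="minmax")

# Any number of frames; each is matched to scaler_input or scaler_output
# by its column count, and a "Class" column passes through unscaled.
X_train_s, X_test_s, y_train_s, y_test_s = prep.scale_transform(
    X_train, X_test, y_train, y_test
)

# Backtransform works the same way and also returns a list.
y_pred_unscaled = prep.scale_inverse(y_pred_scaled)[0]
```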
tests/test_scaler.py (new file, +14 lines)

```diff
@@ -0,0 +1,14 @@
+# import unittest
+# import os
+
+# os.chdir("../src/")
+# print(os.getcwd())
+
+# from preprocessing import *
+
+# class TestScaler(unittest.TestCase):
+#     def sample_test(self):
+#         self.assertEqual(1, 1)
+
+# if __name__ == '__main__':
+#     unittest.main()
```
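The committed test file is entirely commented out. A possible starting point for a real test of the refactored scaler, hedged on two assumptions: that `preprocessing` is importable from the test's path, and that its constructor arguments default to `None`:

```python
import unittest

import numpy as np
import pandas as pd

from preprocessing import preprocessing  # assumed import path


class TestScaler(unittest.TestCase):
    def test_shared_scaler_for_matching_columns(self):
        X = pd.DataFrame({"Ba": [0.0, 1.0, 2.0], "Sr": [1.0, 3.0, 5.0]})
        y = pd.DataFrame({"Ba": [0.5, 1.5, 2.5], "Sr": [2.0, 4.0, 6.0]})
        prep = preprocessing()  # assumes defaults for func_dict_in/out
        prep.scale_fit(X, y, type="minmax")
        # With identical column sets, scale_fit shares one scaler object.
        self.assertIs(prep.scaler_input, prep.scaler_output)

    def test_minmax_round_trip(self):
        X = pd.DataFrame({"Ba": [0.0, 1.0, 2.0], "Sr": [1.0, 3.0, 5.0]})
        prep = preprocessing()
        prep.scale_fit(X, X.copy(), type="minmax")
        scaled = prep.scaler_input.transform(X)
        restored = prep.scaler_input.inverse_transform(scaled)
        np.testing.assert_allclose(restored, X.to_numpy(), rtol=1e-6)


if __name__ == "__main__":
    unittest.main()
```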