From 471038de50439148967b9b0b44418124d62891c6 Mon Sep 17 00:00:00 2001
From: Hannes Signer
Date: Wed, 26 Feb 2025 18:51:40 +0100
Subject: [PATCH] update linting

---
 src/preprocessing.py | 93 +++++++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 40 deletions(-)

diff --git a/src/preprocessing.py b/src/preprocessing.py
index 2f44d5c..7e0711c 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -95,23 +95,25 @@ def custom_loss(
     loss_variant="huber",
     delta=1.0,
 ):
-    """Custom tensorflow loss function to combine Huber Loss with mass balance.
-    This is inspired by PINN (Physics Informed Neural Networks) where the loss function is a combination of the physics-based loss and the data-driven loss.
-    The mass balance is a physics-based loss that ensures the conservation of mass in the system.
-    A tensorflow loss function accepts only the two arguments y_true and y_pred. Therefore, a nested function is used to pass the additional arguments.
+    """
+    Custom tensorflow loss function to combine Huber Loss with mass balance.
+    This is inspired by PINN (Physics Informed Neural Networks), where the loss function is a combination of the physics-based loss and the data-driven loss.
+    The mass balance is a physics-based loss that ensures the conservation of mass in the system.
+    A tensorflow loss function accepts only the two arguments y_true and y_pred. Therefore, a nested function is used to pass the additional arguments.
 
-    Args:
-        preprocess: preprocessing object
-        column_dict: dictionary with the column names as keys and the corresponding index as values. (i.e {'H': 0, 'O': 1, 'Ba': 2, 'Cl': 3, 'S': 4, 'Sr': 5, 'Barite': 6, 'Celestite': 7})
-        h1: hyperparameter for the importance of the huber loss
-        h2: hyperparameter for the importance of the Barium mass balance term
-        h3: hyperparameter for the importance of the Strontium mass balance term
-        scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".
-        loss_variant: Loss function approach. Choose between "huber and "huber_mass_balance". Defaults to "huber".
-        delta: Hyperparameter for the Huber function threshold. Defaults to 1.0.
+    Args:
+        preprocess: preprocessing object
+        column_dict: dictionary with the column names as keys and the corresponding index as values.
+            (e.g. {'H': 0, 'O': 1, 'Ba': 2, 'Cl': 3, 'S': 4, 'Sr': 5, 'Barite': 6, 'Celestite': 7})
+        h1: hyperparameter for the importance of the huber loss
+        h2: hyperparameter for the importance of the Barium mass balance term
+        h3: hyperparameter for the importance of the Strontium mass balance term
+        scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".
+        loss_variant: Loss function approach. Choose between "huber" and "huber_mass_balance". Defaults to "huber".
+        delta: Hyperparameter for the Huber function threshold. Defaults to 1.0.
-    Returns:
-        loss function
+    Returns:
+        loss function
     """
     # as far as I know tensorflow does not directly support the use of scaler objects
@@ -122,12 +124,14 @@ def custom_loss(
                 preprocess.scaler_X.data_range_, dtype=tf.float32
             )
             min_X = tf.convert_to_tensor(
-                preprocess.scaler_X.data_min_, dtype=tf.float32)
+                preprocess.scaler_X.data_min_, dtype=tf.float32
+            )
             scale_y = tf.convert_to_tensor(
                 preprocess.scaler_y.data_range_, dtype=tf.float32
             )
             min_y = tf.convert_to_tensor(
-                preprocess.scaler_y.data_min_, dtype=tf.float32)
+                preprocess.scaler_y.data_min_, dtype=tf.float32
+            )
 
         elif scaler_type == "standard":
             scale_X = tf.convert_to_tensor(
                 preprocess.scaler_X.scale_, dtype=tf.float32
             )
@@ -140,10 +144,14 @@ def custom_loss(
                 preprocess.scaler_y.mean_, dtype=tf.float32)
 
         else:
-            raise Exception("No valid scaler type found. Choose between 'standard' and 'minmax'.")
-
+            raise Exception(
+                "No valid scaler type found. Choose between 'standard' and 'minmax'."
+            )
+
     except AttributeError:
-        raise Exception("Data normalized with scaler different than specified for the training. Compare the scaling approach on preprocessing and training.")
+        raise Exception(
+            "Data normalized with scaler different than specified for the training. Compare the scaling approach on preprocessing and training."
+        )
 
     def loss(results, predicted):
         # inverse min/max scaling
@@ -194,7 +202,8 @@ def custom_loss(
             total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr
         else:
             raise Exception(
-                "No valid loss variant found. Choose between 'huber' and 'huber_mass_balance'.")
+                "No valid loss variant found. Choose between 'huber' and 'huber_mass_balance'."
+            )
 
         return total_loss
 
@@ -212,8 +221,7 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
     Returns:
         mean of both mass balance terms
     """
-
-
+
     if scaler_type == "minmax":
         scale_X = tf.convert_to_tensor(
             preprocess.scaler_X.data_range_, dtype=tf.float32
         )
@@ -284,7 +292,7 @@ def huber_metric(delta=1.0):
         scaler_type (str, optional): _description_. Defaults to "minmax".
         delta (float, optional): _description_. Defaults to 1.0.
""" - + def huber(results, predicted): huber_loss = tf.keras.losses.Huber(delta)(results, predicted) return huber_loss @@ -303,7 +311,7 @@ def mass_balance_evaluation(model, X, preprocess): Returns: vector with the mass balance difference for each cell """ - + # predict the chemistry columns = X.iloc[:, X.columns != "Class"].columns classes = X["Class"] @@ -330,26 +338,31 @@ def mass_balance_evaluation(model, X, preprocess): (prediction["Sr"] + prediction["Celestite"]) - (X["Sr"] + X["Celestite"]) ) - + mass_balance_result = pd.DataFrame( - {"dBa":dBa, "dSr":dSr, "mass_balance":dBa+dSr, "Class": classes} + {"dBa": dBa, "dSr": dSr, "mass_balance": dBa + dSr, "Class": classes} ) return mass_balance_result + def mass_balance_ratio(results, threshold=1e-5): proportion = {} - + mass_balance_threshold = results[results["mass_balance"] <= threshold] - + overall = len(mass_balance_threshold) - class_0_amount = len(mass_balance_threshold[mass_balance_threshold["Class"] == 0]) - class_1_amount = len(mass_balance_threshold[mass_balance_threshold["Class"] == 1]) - + class_0_amount = len( + mass_balance_threshold[mass_balance_threshold["Class"] == 0]) + class_1_amount = len( + mass_balance_threshold[mass_balance_threshold["Class"] == 1]) + proportion["overall"] = overall / len(results) - proportion["class_0"] = class_0_amount / len(results[results["Class"] == 0]) - proportion["class_1"] = class_1_amount / len(results[results["Class"] == 1]) - + proportion["class_0"] = class_0_amount / \ + len(results[results["Class"] == 0]) + proportion["class_1"] = class_1_amount / \ + len(results[results["Class"] == 1]) + return proportion @@ -358,7 +371,7 @@ class preprocessing: A class used to preprocess data for model training. Attributes """ - + def __init__(self, func_dict_in=None, func_dict_out=None, random_state=42): """Initialization of the preprocessing object. @@ -524,7 +537,7 @@ class preprocessing: scaling: learn individual scaler for X and y when "individual" is selected or one global scaler on all data in X and y if "global" is selected (scaler_X and scaler_y are equal) type (str, optional): Using MinMax Scaling or Standarization. Defaults to "Standard". """ - + if type == "minmax": self.scaler_X = MinMaxScaler() self.scaler_y = MinMaxScaler() @@ -552,7 +565,7 @@ class preprocessing: self.state["scale"] = True def scale_transform(self, X_train, X_test, y_train, y_test): - """ Apply learned scaler on datasets. + """Apply learned scaler on datasets. Args: X_train: design training data @@ -563,7 +576,7 @@ class preprocessing: Returns: transformed dataframes """ - + X_train = pd.concat( [ self.scaler_X.transform( @@ -608,7 +621,7 @@ class preprocessing: Returns: Backtransformed data frames """ - + result = [] for i in args: if "Class" in i.columns: