From 471038de50439148967b9b0b44418124d62891c6 Mon Sep 17 00:00:00 2001
From: Hannes Signer
Date: Wed, 26 Feb 2025 18:51:40 +0100
Subject: [PATCH] update linting

---
 src/preprocessing.py | 93 +++++++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 40 deletions(-)

diff --git a/src/preprocessing.py b/src/preprocessing.py
index 2f44d5c..7e0711c 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -95,23 +95,25 @@ def custom_loss(
     loss_variant="huber",
     delta=1.0,
 ):
-    """Custom tensorflow loss function to combine Huber Loss with mass balance.
-    This is inspired by PINN (Physics Informed Neural Networks) where the loss function is a combination of the physics-based loss and the data-driven loss.
-    The mass balance is a physics-based loss that ensures the conservation of mass in the system.
-    A tensorflow loss function accepts only the two arguments y_true and y_pred. Therefore, a nested function is used to pass the additional arguments.
+    """
+    Custom tensorflow loss function to combine Huber Loss with mass balance.
+    This is inspired by PINN (Physics Informed Neural Networks), where the loss function is a combination of the physics-based loss and the data-driven loss.
+    The mass balance is a physics-based loss that ensures the conservation of mass in the system.
+    A tensorflow loss function accepts only the two arguments y_true and y_pred. Therefore, a nested function is used to pass the additional arguments.
 
-    Args:
-        preprocess: preprocessing object
-        column_dict: dictionary with the column names as keys and the corresponding index as values. (i.e {'H': 0, 'O': 1, 'Ba': 2, 'Cl': 3, 'S': 4, 'Sr': 5, 'Barite': 6, 'Celestite': 7})
-        h1: hyperparameter for the importance of the huber loss
-        h2: hyperparameter for the importance of the Barium mass balance term
-        h3: hyperparameter for the importance of the Strontium mass balance term
-        scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".
-        loss_variant: Loss function approach. Choose between "huber and "huber_mass_balance". Defaults to "huber".
-        delta: Hyperparameter for the Huber function threshold. Defaults to 1.0.
+    Args:
+        preprocess: preprocessing object
+        column_dict: dictionary with the column names as keys and the corresponding index as values.
+            (e.g. {'H': 0, 'O': 1, 'Ba': 2, 'Cl': 3, 'S': 4, 'Sr': 5, 'Barite': 6, 'Celestite': 7})
+        h1: hyperparameter for the importance of the huber loss
+        h2: hyperparameter for the importance of the Barium mass balance term
+        h3: hyperparameter for the importance of the Strontium mass balance term
+        scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".
+        loss_variant: Loss function approach. Choose between "huber" and "huber_mass_balance". Defaults to "huber".
+        delta: Hyperparameter for the Huber function threshold. Defaults to 1.0.
-    Returns:
-        loss function
+    Returns:
+        loss function
     """
     # as far as I know tensorflow does not directly support the use of scaler objects
@@ -122,12 +124,14 @@ def custom_loss(
                 preprocess.scaler_X.data_range_, dtype=tf.float32
             )
             min_X = tf.convert_to_tensor(
-                preprocess.scaler_X.data_min_, dtype=tf.float32)
+                preprocess.scaler_X.data_min_, dtype=tf.float32
+            )
             scale_y = tf.convert_to_tensor(
                 preprocess.scaler_y.data_range_, dtype=tf.float32
             )
             min_y = tf.convert_to_tensor(
-                preprocess.scaler_y.data_min_, dtype=tf.float32)
+                preprocess.scaler_y.data_min_, dtype=tf.float32
+            )
 
         elif scaler_type == "standard":
             scale_X = tf.convert_to_tensor(
                 preprocess.scaler_X.scale_, dtype=tf.float32
             )
@@ -140,10 +144,14 @@ def custom_loss(
                 preprocess.scaler_y.mean_, dtype=tf.float32)
 
         else:
-            raise Exception("No valid scaler type found. Choose between 'standard' and 'minmax'.")
-
+            raise Exception(
+                "No valid scaler type found. Choose between 'standard' and 'minmax'."
+            )
+
     except AttributeError:
-        raise Exception("Data normalized with scaler different than specified for the training. Compare the scaling approach on preprocessing and training.")
+        raise Exception(
+            "Data normalized with scaler different than specified for the training. Compare the scaling approach on preprocessing and training."
+        )
 
     def loss(results, predicted):
         # inverse min/max scaling
@@ -194,7 +202,8 @@ def custom_loss(
             total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr
         else:
             raise Exception(
-                "No valid loss variant found. Choose between 'huber' and 'huber_mass_balance'.")
+                "No valid loss variant found. Choose between 'huber' and 'huber_mass_balance'."
+            )
 
         return total_loss
 
@@ -212,8 +221,7 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
     Returns:
         mean of both mass balance terms
     """
-
-
+
     if scaler_type == "minmax":
         scale_X = tf.convert_to_tensor(
             preprocess.scaler_X.data_range_, dtype=tf.float32
         )
@@ -284,7 +292,7 @@ def huber_metric(delta=1.0):
         scaler_type (str, optional): _description_. Defaults to "minmax".
         delta (float, optional): _description_. Defaults to 1.0.
""" - + def huber(results, predicted): huber_loss = tf.keras.losses.Huber(delta)(results, predicted) return huber_loss @@ -303,7 +311,7 @@ def mass_balance_evaluation(model, X, preprocess): Returns: vector with the mass balance difference for each cell """ - + # predict the chemistry columns = X.iloc[:, X.columns != "Class"].columns classes = X["Class"] @@ -330,26 +338,31 @@ def mass_balance_evaluation(model, X, preprocess): (prediction["Sr"] + prediction["Celestite"]) - (X["Sr"] + X["Celestite"]) ) - + mass_balance_result = pd.DataFrame( - {"dBa":dBa, "dSr":dSr, "mass_balance":dBa+dSr, "Class": classes} + {"dBa": dBa, "dSr": dSr, "mass_balance": dBa + dSr, "Class": classes} ) return mass_balance_result + def mass_balance_ratio(results, threshold=1e-5): proportion = {} - + mass_balance_threshold = results[results["mass_balance"] <= threshold] - + overall = len(mass_balance_threshold) - class_0_amount = len(mass_balance_threshold[mass_balance_threshold["Class"] == 0]) - class_1_amount = len(mass_balance_threshold[mass_balance_threshold["Class"] == 1]) - + class_0_amount = len( + mass_balance_threshold[mass_balance_threshold["Class"] == 0]) + class_1_amount = len( + mass_balance_threshold[mass_balance_threshold["Class"] == 1]) + proportion["overall"] = overall / len(results) - proportion["class_0"] = class_0_amount / len(results[results["Class"] == 0]) - proportion["class_1"] = class_1_amount / len(results[results["Class"] == 1]) - + proportion["class_0"] = class_0_amount / \ + len(results[results["Class"] == 0]) + proportion["class_1"] = class_1_amount / \ + len(results[results["Class"] == 1]) + return proportion @@ -358,7 +371,7 @@ class preprocessing: A class used to preprocess data for model training. Attributes """ - + def __init__(self, func_dict_in=None, func_dict_out=None, random_state=42): """Initialization of the preprocessing object. @@ -524,7 +537,7 @@ class preprocessing: scaling: learn individual scaler for X and y when "individual" is selected or one global scaler on all data in X and y if "global" is selected (scaler_X and scaler_y are equal) type (str, optional): Using MinMax Scaling or Standarization. Defaults to "Standard". """ - + if type == "minmax": self.scaler_X = MinMaxScaler() self.scaler_y = MinMaxScaler() @@ -552,7 +565,7 @@ class preprocessing: self.state["scale"] = True def scale_transform(self, X_train, X_test, y_train, y_test): - """ Apply learned scaler on datasets. + """Apply learned scaler on datasets. Args: X_train: design training data @@ -563,7 +576,7 @@ class preprocessing: Returns: transformed dataframes """ - + X_train = pd.concat( [ self.scaler_X.transform( @@ -608,7 +621,7 @@ class preprocessing: Returns: Backtransformed data frames """ - + result = [] for i in args: if "Class" in i.columns: