Compare commits

...

45 Commits

| Author | SHA1 | Message | Date |
|--------|------|---------|------|
| Hannes Signer | 4fe35855f0 | Merge branch 'main' of git.gfz-potsdam.de:naaice/model-training | 2025-02-27 12:05:31 +01:00 |
| Hannes Signer | 91fd34bce0 | Merge branch 'loss-experiment' | 2025-02-27 12:04:13 +01:00 |
| Hannes Martin Signer | 1d9d193668 | update julia script | 2025-02-27 11:44:51 +01:00 |
| Hannes Martin Signer | 5fee700eae | Merge branch 'log-scale-error' into 'loss-experiment': Solve scaling error with min max scaling and restructure notebook as well as preprocessing functions (see merge request naaice/model-training!3) | 2025-02-27 11:42:05 +01:00 |
| Hannes Signer | 471038de50 | update linting | 2025-02-26 18:51:40 +01:00 |
| Hannes Signer | 5f4c863b39 | first draft for students | 2025-02-26 18:32:22 +01:00 |
| Hannes Signer | 331306c141 | update mass_balance_evaluation | 2025-02-26 18:32:11 +01:00 |
| Hannes Signer | cce2f696a0 | document functions | 2025-02-26 12:57:39 +01:00 |
| Hannes Signer | 03918222c4 | test with losses | 2025-02-25 18:36:11 +01:00 |
| Hannes Signer | 0f7ee78a8a | correction of scaling error in mass balance loss | 2025-02-25 18:06:56 +01:00 |
| Hannes Signer | 8051eb3c3d | investigate scaling error | 2025-02-21 16:57:55 +01:00 |
| Hannes Signer | c0ca00271e | adapt notebook | 2025-02-21 09:42:33 +01:00 |
| Hannes Signer | 5ffcd75adc | update test samples | 2025-02-19 17:58:47 +01:00 |
| Hannes Signer | df7cabfb31 | update notebook to new training structure | 2025-02-18 20:47:24 +01:00 |
| Hannes Signer | 1381e9a879 | add first models | 2025-02-18 20:28:43 +01:00 |
| Hannes Signer | 7642d22d4d | add 4 corner small dataset | 2025-02-18 20:08:13 +01:00 |
| Hannes Signer | 3bf4ff42a0 | add dataset | 2025-02-18 20:06:23 +01:00 |
| Hannes Signer | bf0dfcb690 | delete cache | 2025-02-18 19:55:13 +01:00 |
| Hannes Signer | c447a6c84a | Merge branch 'loss-experiment' of git.gfz-potsdam.de:naaice/model-training into loss-experiment | 2025-02-18 17:16:16 +01:00 |
| Hannes Signer | 1a3787c530 | add measurement plan | 2025-02-18 17:08:50 +01:00 |
| Hannes Signer | b58e8869ad | prepare optimization study | 2025-02-18 17:05:24 +01:00 |
| Hannes Signer | 052cf356c9 | restructure project | 2025-02-18 14:41:41 +01:00 |
| Hannes Signer | 019a426af4 | update gitignore | 2025-02-17 22:23:48 +01:00 |
| Hannes Signer | 4a86161cce | test combined loss functions | 2025-02-17 22:23:28 +01:00 |
| Hannes Signer | 986191f72c | update training | 2025-02-17 18:13:57 +01:00 |
| Hannes Signer | d27e9e9c5d | add optimizer framework | 2025-02-17 00:11:36 +01:00 |
| Hannes Signer | 3171dd3643 | custom loss function | 2025-02-14 16:33:32 +01:00 |
| Hannes Signer | e9f49308f2 | further tests | 2025-02-13 17:11:45 +01:00 |
| Hannes Signer | 4d7502c309 | Merge branch 'loss-experiment' of git.gfz-potsdam.de:naaice/model-training into loss-experiment | 2025-02-11 17:22:13 +01:00 |
| Hannes Signer | a0145576f4 | add new model architecture | 2025-02-11 17:17:43 +01:00 |
| Hannes Signer | 3eb46f6c26 | further experiments | 2025-01-24 16:28:56 +01:00 |
| Hannes Signer | 67b1f23fa3 | update preprocessing | 2025-01-23 17:51:26 +01:00 |
| Hannes Signer | 78a57654a3 | encapsulate functionality | 2025-01-22 17:36:06 +01:00 |
| Hannes Signer | a4c419c618 | add template class | 2025-01-21 17:36:41 +01:00 |
| Hannes Signer | 81d35796f8 | add custom scaler steps | 2025-01-20 19:40:57 +01:00 |
| Hannes Signer | f0a4ddddc6 | add new dataset | 2025-01-20 10:25:27 +01:00 |
| Hannes Signer | d517ca6540 | add sampling function | 2025-01-15 17:23:58 +01:00 |
| Hannes Signer | bea094be35 | update sampling methods | 2025-01-15 16:04:14 +01:00 |
| Hannes Signer | 6872dc1aed | add normal oversampling | 2025-01-15 14:11:13 +01:00 |
| Hannes Signer | 78cd2f2083 | add data conversion script | 2025-01-15 12:02:54 +01:00 |
| Hannes Signer | 66fd10c19e | add SMORT sampling | 2025-01-15 12:01:22 +01:00 |
| Hannes Signer | bb94da79d7 | add data conversion script | 2025-01-15 12:00:30 +01:00 |
| Hannes Signer | 4ee967f613 | add SMOT sampling strategy | 2025-01-15 11:50:05 +01:00 |
| Hannes Signer | 60f0165df1 | add SMOT sampling strategy | 2025-01-15 11:49:01 +01:00 |
| Hannes Signer | f89228de06 | add kmeans clustering | 2025-01-14 17:48:41 +01:00 |
16 changed files with 1934 additions and 625 deletions

1
.gitignore vendored Normal file

@@ -0,0 +1 @@
__pycache__/preprocessing.cpython-311.pyc

BIN
Barite_50_Data.h5 (Stored with Git LFS)

Binary file not shown.

BIN
Barite_50_Data_inference.h5 (Stored with Git LFS)

Binary file not shown.

File diff suppressed because one or more lines are too long

60
convert_data.jl Normal file

@@ -0,0 +1,60 @@
using HDF5
using RData
using DataFrames
# Load Training Data
# train_data = load("Barite_50_Data.rds")
# training_h5_name = "Barite_50_Data.h5"
# h5open(training_h5_name, "w") do fid
# for key in keys(train_data)
# group = create_group(fid, key)
# group["names"] = names(train_data[key])
# group["data", compress=3] = Matrix(train_data[key])
# # group = create_group(fid, key)
# # grou["names"] = coln
# end
# end
# List all .rds files starting with "iter" in a given directory
rds_files = filter(x -> startswith(x, "iter"), readdir("barite_out/"))
# remove "iter_0.rds" from the list
rds_files = rds_files[2:end]
big_df_in = DataFrame()
big_df_out = DataFrame()
for rds_file in rds_files
# Load the RDS file
data = load("barite_out/$rds_file")
# Convert the data to a DataFrame
df_T = DataFrame(data["T"])
df_C = DataFrame(data["C"])
# Append the DataFrame to the big DataFrame
append!(big_df_in, df_T)
append!(big_df_out, df_C)
end
# remove ID, Barite_p1, Celestite_p1 columns
big_df_in = big_df_in[:, Not([:ID, :Barite_p1, :Celestite_p1])]
big_df_out = big_df_out[:, Not([:ID, :Barite_p1, :Celestite_p1])]
inference_h5_name = "Barite_50_Data_inference.h5"
h5open(inference_h5_name, "w") do fid
fid["names"] = names(big_df_in)
fid["data", compress=9] = Matrix(big_df_in)
end
training_h5_name = "Barite_50_Data_training.h5"
h5open(training_h5_name, "w") do fid
group_in = create_group(fid, "design")
group_out = create_group(fid, "result")
group_in["names"] = names(big_df_in)
group_in["data", compress=9] = Matrix(big_df_in)
group_out["names"] = names(big_df_out)
group_out["data", compress=9] = Matrix(big_df_out)
end
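
As a quick orientation on the output layout (not part of the change set): the training file written at the end of this script contains `design` and `result` groups, each with a `names` and a `data` dataset, which is the layout that `src/optuna_runs.py` later reads. A minimal Python sketch, assuming the `Barite_50_Data_training.h5` file produced above:

```python
# Sketch: read the HDF5 layout written by convert_data.jl back into pandas.
# The transpose mirrors src/optuna_runs.py: HDF5.jl stores Julia matrices column-major,
# so h5py sees the dimensions reversed.
import h5py
import numpy as np
import pandas as pd

with h5py.File("Barite_50_Data_training.h5", "r") as f:
    design = pd.DataFrame(np.array(f["design"]["data"]).transpose(),
                          columns=np.array(f["design"]["names"].asstr()))
    result = pd.DataFrame(np.array(f["result"]["data"]).transpose(),
                          columns=np.array(f["result"]["names"].asstr()))
print(design.shape, result.shape)
```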

2
datasets/.gitattributes vendored Normal file

@@ -0,0 +1,2 @@
Barite_50_Data_training.h5 filter=lfs diff=lfs merge=lfs -text
{file_name} filter=lfs diff=lfs merge=lfs -text

BIN
datasets/barite_50_4_corner.h5 (Stored with Git LFS) Normal file

Binary file not shown.

24
doc/measurement_plan.md Normal file

@@ -0,0 +1,24 @@
# Measurement overview for model optimization
### Parameters to optimize
### Saved models
`./results/model_large_standardization.keras`: Trained on the `barite_50_4_corner.h5` dataset with the extended loss function (Huber loss with mass balance) and **standardized data**
### Experiments
| **Experiment** | **Dataset** | **Model** | **Loss function** | **Activation** | **Preprocessing** |
|----------------|-----------------------|-----------|-------------------|----------------|-------------------|
| 1 | barite_50_4_corner.h5 | large | Huber+dBa+dSr | LeakyReLU | Standardization |
| 2 | barite_50_4_corner.h5 | | | | |
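
The saved model listed above was trained with a custom loss, so loading it back for inference is easiest without re-compiling. A minimal sketch, assuming inference only (the custom Huber+mass-balance loss and metrics are then not needed):

```python
# Sketch: load the saved Keras model for inference only (compile=False skips the custom loss).
import keras
import numpy as np

model = keras.models.load_model("./results/model_large_standardization.keras", compile=False)
# The architectures in src/preprocessing.py take 8 input features (H, O, Ba, Cl, S, Sr, Barite, Celestite)
# and predict the same 8 species; inputs must be standardized the same way as during training.
dummy = np.zeros((1, 8), dtype="float32")
print(model.predict(dummy).shape)  # expected: (1, 8)
```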

BIN
model_large.keras Normal file

Binary file not shown.

BIN
results/models/model_large_minmax.keras (Stored with Git LFS) Normal file

Binary file not shown.

BIN
results/models/model_large_standardization.keras (Stored with Git LFS) Normal file

Binary file not shown.

952
src/POET_Training.ipynb Normal file

File diff suppressed because one or more lines are too long

88
src/convert_data.jl Normal file

@@ -0,0 +1,88 @@
#!/usr/bin/env julia
# qs_read = []
# # find all the files in 'barite_out'
# files = readdir("barite_out"; join=true)
# # remove files which do not have the extension '.qs2' and contains 'iter'
# files = filter(x -> occursin(r".*\.qs2", x) && occursin(r"iter", x), files)
# # remove first entry as it is iteration 0
# files = files[2:end]
# test1 = qs_read(files[1])
# @rput test1
# R"test1 <- test1$C"
# @rget test1
# check if ARGS contains 2 elements
if length(ARGS) != 2
println("Usage: julia convert.jl <directory> <output_file_name>.h5")
exit(1)
end
to_read_dir = ARGS[1]
output_file_name = ARGS[2] * ".h5"
# check if the directory exists
if !isdir(to_read_dir)
println("The directory \"$to_read_dir\" does not exist")
exit(1)
end
using HDF5, RCall, DataFrames
@rlibrary qs2
# List all .rds files starting with "iter" in a given directory
qs_files = filter(x -> occursin(r".*\.qs2", x) && occursin(r"iter", x), readdir(to_read_dir; join=true))[2:end]
df_design = DataFrame()
df_result = DataFrame()
for file in qs_files
# Load the RDS file
data = qs_read(file)
# get basename of the file
basename = split(file, "/")[end]
# get the iteration number by splitting the basename and parse the second element
iteration = parse(Int, split(split(basename, "_")[2], ".")[1])
@rput data
R"transport <- data$T"
R"chemistry <- data$C"
@rget transport
@rget chemistry
# Add iteration number to the DataFrame
transport.iteration = fill(iteration, size(transport, 1))
chemistry.iteration = fill(iteration, size(chemistry, 1))
# Append the DataFrame to the big DataFrame
append!(df_design, transport)
append!(df_result, chemistry)
end
# remove ID, Barite_p1, Celestite_p1 columns
df_design = df_design[:, Not([:ID, :Barite_p1, :Celestite_p1])]
df_result = df_result[:, Not([:ID, :Barite_p1, :Celestite_p1])]
h5open(output_file_name, "w") do fid
group_in = create_group(fid, "design")
group_out = create_group(fid, "result")
group_in["names"] = names(df_design)
group_in["data", compress=9] = Matrix(df_design)
group_out["names"] = names(df_result)
group_out["data", compress=9] = Matrix(df_result)
end
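
For reference, this converter is invoked as `julia src/convert_data.jl <directory> <output_file_name>` and appends the `.h5` extension itself; unlike the root-level `convert_data.jl`, it also keeps the chemistry iteration number in an extra `iteration` column. A minimal Python check, assuming a hypothetical run that produced `barite_50.h5`:

```python
# Sketch: verify that the converter wrote design/result groups and kept the added "iteration" column.
import h5py
import numpy as np

with h5py.File("barite_50.h5", "r") as f:  # hypothetical output name
    for group in ("design", "result"):
        names = list(np.array(f[group]["names"].asstr()))
        assert "iteration" in names, f"'iteration' column missing in {group}"
        print(group, f[group]["data"].shape)
```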

138
src/optuna_runs.py Normal file

@@ -0,0 +1,138 @@
import keras
from keras.layers import Dense, Dropout, Input,BatchNormalization
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from preprocessing import *
from sklearn import set_config
from importlib import reload
set_config(transform_output = "pandas")
import optuna
import pickle
def objective(trial, X, y, species_columns):
model_type = trial.suggest_categorical("model", ["small", "large", "paper"])
scaler_type = trial.suggest_categorical("scaler", ["standard", "minmax"])
sampling_type = trial.suggest_categorical("sampling", ["over", "off"])
loss_variant = trial.suggest_categorical("loss", ["huber", "huber_mass_balance"])
delta = trial.suggest_float("delta", 0.5, 5.0)
preprocess = preprocessing()
    X, y = preprocess.cluster(X[species_columns], y[species_columns])
X_train, X_test, y_train, y_test = preprocess.split(X, y, ratio = 0.2)
X_train, y_train = preprocess.balancer(X_train, y_train, strategy = sampling_type)
preprocess.scale_fit(X_train, y_train, scaling = "global", type=scaler_type)
X_train, X_test, y_train, y_test = preprocess.scale_transform(X_train, X_test, y_train, y_test)
X_train, X_val, y_train, y_val = preprocess.split(X_train, y_train, ratio = 0.1)
column_dict = {"Ba": X.columns.get_loc("Ba"), "Barite": X.columns.get_loc("Barite"), "Sr": X.columns.get_loc(
"Sr"), "Celestite": X.columns.get_loc("Celestite"), "H": X.columns.get_loc("H"), "H": X.columns.get_loc("H"), "O": X.columns.get_loc("O")}
h1 = trial.suggest_float("h1", 0.1, 1.0)
h2 = trial.suggest_float("h2", 0.1, 1.0)
h3 = trial.suggest_float("h3", 0.1, 1.0)
model = model_definition(model_type)
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=0.001,
decay_steps=2000,
decay_rate=0.9,
staircase=True
)
optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)
model.compile(optimizer=optimizer, loss=custom_loss(preprocess, column_dict, h1, h2, h3, scaler_type, loss_variant, delta),
metrics=[huber_metric(preprocess, scaler_type, delta), mass_balance_metric(preprocess, column_dict, scaler_type)])
callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
history = model.fit(X_train.loc[:, X_train.columns != "Class"],
y_train.loc[:, y_train.columns != "Class"],
batch_size=512,
epochs=100,
validation_data=(X_val.loc[:, X_val.columns != "Class"], y_val.loc[:, y_val.columns != "Class"]),
callbacks=[callback])
prediction_huber_overall = model.evaluate(
X_test.loc[:, X_test.columns != "Class"], y_test.loc[:, y_test.columns != "Class"])[1]
prediction_huber_non_reactive = model.evaluate(
X_test[X_test['Class'] == 0].iloc[:, X_test.columns != "Class"], y_test[X_test['Class'] == 0].iloc[:, y_test.columns != "Class"])[1]
prediction_huber_reactive = model.evaluate(
X_test[X_test['Class'] == 1].iloc[:, X_test.columns != "Class"], y_test[X_test['Class'] == 1].iloc[:, y_test.columns != "Class"])[1]
prediction_mass_balance_overall = model.evaluate(
X_test.loc[:, X_test.columns != "Class"], y_test.loc[:, y_test.columns != "Class"])[2]
prediction_mass_balance_non_reactive = model.evaluate(
X_test[X_test['Class'] == 0].iloc[:, X_test.columns != "Class"], y_test[X_test['Class'] == 0].iloc[:, y_test.columns != "Class"])[2]
prediction_mass_balance_reactive = model.evaluate(
X_test[X_test['Class'] == 1].iloc[:, X_test.columns != "Class"], y_test[X_test['Class'] == 1].iloc[:, y_test.columns != "Class"])[2]
mass_balance_results = mass_balance_evaluation(model, X_test, preprocess)
    mass_balance_ratio = len(
        mass_balance_results[mass_balance_results["mass_balance"] < 1e-5]) / len(mass_balance_results)
results_save_path = os.path.join("./results/", "results.csv")
results_df = pd.DataFrame({
"trial": [trial.number],
"prediction_huber_overall": [prediction_huber_overall],
"prediction_huber_non_reactive": [prediction_huber_non_reactive],
"prediction_huber_reactive": [prediction_huber_reactive],
"prediction_mass_balance_overall": [prediction_mass_balance_overall],
"prediction_mass_balance_non_reactive": [prediction_mass_balance_non_reactive],
"prediction_mass_balance_reactive": [prediction_mass_balance_reactive]
})
if not os.path.isfile(results_save_path):
results_df.to_csv(results_save_path, index=False)
else:
results_df.to_csv(results_save_path, mode='a', header=False, index=False)
model_save_path_trial = os.path.join("./results/models/", f"model_trial_{trial.number}.keras")
history_save_path_trial = os.path.join("./results/history/", f"history_trial_{trial.number}.pkl")
model.save(model_save_path_trial)
with open(history_save_path_trial, 'wb') as f:
pickle.dump(history.history, f)
return prediction_huber_overall, mass_balance_ratio
if __name__ == "__main__":
print(os.path.abspath("./datasets/barite_50_4_corner.h5"))
data_file = h5py.File("./datasets/barite_50_4_corner.h5")
design = data_file["design"]
results = data_file["result"]
df_design = pd.DataFrame(np.array(design["data"]).transpose(), columns = np.array(design["names"].asstr()))
df_results = pd.DataFrame(np.array(results["data"]).transpose(), columns = np.array(results["names"].asstr()))
data_file.close()
species_columns = ['H', 'O', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']
study = optuna.create_study(storage="sqlite:///model_large_optimization.db", study_name="model_optimization", directions=["minimize", "maximize"])
study.optimize(lambda trial: objective(trial, df_design, df_results, species_columns), n_trials=1000)
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
print(" {}: {}".format(key, value))

660
src/preprocessing.py Normal file

@@ -0,0 +1,660 @@
import keras
from keras.layers import Dense, Dropout, Input, BatchNormalization, LeakyReLU
import tensorflow as tf
import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from sklearn import set_config
from importlib import reload
set_config(transform_output="pandas")
def model_definition(architecture):
"""Definition of the respective AI model. Three models are currently being analysed, which are labelled small, large or paper.
Args:
architecture (String): Choose between 'small', 'large' or 'paper'.
Returns:
keras model: Returns the respective model.
"""
dtype = "float32"
if architecture == "small":
model = keras.Sequential(
[
keras.Input(shape=(8,), dtype=dtype),
keras.layers.Dense(units=128, dtype=dtype),
LeakyReLU(negative_slope=0.01),
# Dropout(0.2),
keras.layers.Dense(units=128, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(units=8, dtype=dtype),
]
)
elif architecture == "large":
model = keras.Sequential(
[
keras.layers.Input(shape=(8,), dtype=dtype),
keras.layers.Dense(512, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(1024, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(512, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(8, dtype=dtype),
]
)
elif architecture == "paper":
model = keras.Sequential(
[
keras.layers.Input(shape=(8,), dtype=dtype),
keras.layers.Dense(128, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(256, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(512, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(256, dtype=dtype),
LeakyReLU(negative_slope=0.01),
keras.layers.Dense(8, dtype=dtype),
]
)
else:
raise Exception(
"No valid architecture found."
+ "Choose between 'small', 'large' or 'paper'."
)
return model
@keras.saving.register_keras_serializable()
def custom_loss(
preprocess,
column_dict,
h1,
h2,
h3,
scaler_type="minmax",
loss_variant="huber",
delta=1.0,
):
"""
Custom tensorflow loss function to combine Huber Loss with mass balance.
This is inspired by PINN (Physics Informed Neural Networks) where the loss function is a combination of the physics-based loss and the data-driven loss.
The mass balance is a physics-based loss that ensures the conservation of mass in the system.
A tensorflow loss function accepts only the two arguments y_true and y_pred. Therefore, a nested function is used to pass the additional arguments.
Args:
preprocess: preprocessing object
column_dict: dictionary with the column names as keys and the corresponding index as values.
(i.e {'H': 0, 'O': 1, 'Ba': 2, 'Cl': 3, 'S': 4, 'Sr': 5, 'Barite': 6, 'Celestite': 7})
h1: hyperparameter for the importance of the huber loss
h2: hyperparameter for the importance of the Barium mass balance term
h3: hyperparameter for the importance of the Strontium mass balance term
scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".
        loss_variant: Loss function approach. Choose between "huber" and "huber_mass_balance". Defaults to "huber".
delta: Hyperparameter for the Huber function threshold. Defaults to 1.0.
Returns:
loss function
"""
# as far as I know tensorflow does not directly support the use of scaler objects
# therefore, the backtransformation is done manually
try:
if scaler_type == "minmax":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.data_range_, dtype=tf.float32
)
min_X = tf.convert_to_tensor(
preprocess.scaler_X.data_min_, dtype=tf.float32
)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.data_range_, dtype=tf.float32
)
min_y = tf.convert_to_tensor(
preprocess.scaler_y.data_min_, dtype=tf.float32
)
elif scaler_type == "standard":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.scale_, dtype=tf.float32)
mean_X = tf.convert_to_tensor(
preprocess.scaler_X.mean_, dtype=tf.float32)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.scale_, dtype=tf.float32)
mean_y = tf.convert_to_tensor(
preprocess.scaler_y.mean_, dtype=tf.float32)
else:
raise Exception(
"No valid scaler type found. Choose between 'standard' and 'minmax'."
)
except AttributeError:
raise Exception(
"Data normalized with scaler different than specified for the training. Compare the scaling approach on preprocessing and training."
)
def loss(results, predicted):
# inverse min/max scaling
if scaler_type == "minmax":
predicted_inverse = predicted * scale_y + min_y
results_inverse = results * scale_X + min_X
# inverse standard scaling
elif scaler_type == "standard":
predicted_inverse = predicted * scale_y + mean_y
results_inverse = results * scale_X + mean_X
        # apply expm1 to the columns of predicted_inverse and results_inverse if a log1p transformation was used
if preprocess.func_dict_out is not None:
predicted_inverse = tf.math.expm1(predicted_inverse)
results_inverse = tf.math.expm1(results_inverse)
# mass balance
# in total no Barium and Strontium should be lost in one simulation step
dBa = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["Ba"]]
+ predicted_inverse[:, column_dict["Barite"]]
)
- (
results_inverse[:, column_dict["Ba"]]
+ results_inverse[:, column_dict["Barite"]]
)
)
dSr = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["Sr"]]
+ predicted_inverse[:, column_dict["Celestite"]]
)
- (
results_inverse[:, column_dict["Sr"]]
+ results_inverse[:, column_dict["Celestite"]]
)
)
# huber loss
huber_loss = tf.keras.losses.Huber(delta)(results, predicted)
# total loss
if loss_variant == "huber":
total_loss = huber_loss
elif loss_variant == "huber_mass_balance":
total_loss = h1 * huber_loss + h2 * dBa + h3 * dSr
else:
raise Exception(
"No valid loss variant found. Choose between 'huber' and 'huber_mass_balance'."
)
return total_loss
return loss
def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
"""Auxilary function to calculate the mass balance during training.
Args:
preprocess: preprocessing object
column_dict: dictionary with the column names as keys and the corresponding index as values
scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".
Returns:
mean of both mass balance terms
"""
if scaler_type == "minmax":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.data_range_, dtype=tf.float32
)
min_X = tf.convert_to_tensor(
preprocess.scaler_X.data_min_, dtype=tf.float32)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.data_range_, dtype=tf.float32
)
min_y = tf.convert_to_tensor(
preprocess.scaler_y.data_min_, dtype=tf.float32)
elif scaler_type == "standard":
scale_X = tf.convert_to_tensor(
preprocess.scaler_X.scale_, dtype=tf.float32)
mean_X = tf.convert_to_tensor(
preprocess.scaler_X.mean_, dtype=tf.float32)
scale_y = tf.convert_to_tensor(
preprocess.scaler_y.scale_, dtype=tf.float32)
mean_y = tf.convert_to_tensor(
preprocess.scaler_y.mean_, dtype=tf.float32)
def mass_balance(results, predicted):
# inverse min/max scaling
if scaler_type == "minmax":
predicted_inverse = predicted * scale_y + min_y
results_inverse = results * scale_X + min_X
elif scaler_type == "standard":
predicted_inverse = predicted * scale_y + mean_y
results_inverse = results * scale_X + mean_X
if preprocess.func_dict_out is not None:
predicted_inverse = tf.math.expm1(predicted_inverse)
results_inverse = tf.math.expm1(results_inverse)
# mass balance
dBa = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["Ba"]]
+ predicted_inverse[:, column_dict["Barite"]]
)
- (
results_inverse[:, column_dict["Ba"]]
+ results_inverse[:, column_dict["Barite"]]
)
)
dSr = tf.keras.backend.abs(
(
predicted_inverse[:, column_dict["Sr"]]
+ predicted_inverse[:, column_dict["Celestite"]]
)
- (
results_inverse[:, column_dict["Sr"]]
+ results_inverse[:, column_dict["Celestite"]]
)
)
return tf.reduce_mean(dBa + dSr)
return mass_balance
def huber_metric(preprocess=None, scaler_type="minmax", delta=1.0):
    """Auxiliary function to calculate the Huber loss during training.
    Args:
        preprocess: preprocessing object (accepted for a uniform metric-factory interface, unused here)
        scaler_type (str, optional): Normalization approach. Defaults to "minmax". Unused here, since the Huber metric is computed on the scaled values.
        delta (float, optional): Hyperparameter for the Huber function threshold. Defaults to 1.0.
    """
def huber(results, predicted):
huber_loss = tf.keras.losses.Huber(delta)(results, predicted)
return huber_loss
return huber
def mass_balance_evaluation(model, X, preprocess):
"""Calculates the mass balance difference for each cell.
Args:
model: trained model
X: data where the mass balance should be calculated
preprocess: preprocessing object
Returns:
vector with the mass balance difference for each cell
"""
# predict the chemistry
columns = X.iloc[:, X.columns != "Class"].columns
classes = X["Class"]
classes.reset_index(drop=True, inplace=True)
prediction = pd.DataFrame(model.predict(X[columns]), columns=columns)
# backtransform min/max or standard scaler
X = pd.DataFrame(
preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]),
columns=columns,
)
prediction = pd.DataFrame(
preprocess.scaler_y.inverse_transform(prediction), columns=columns
)
# apply backtransformation if log transformation was applied
if preprocess.func_dict_out is not None:
X = preprocess.funcInverse(X)[0]
prediction = preprocess.funcInverse(prediction)[0]
# calculate mass balance
dBa = np.abs(
(prediction["Ba"] + prediction["Barite"]) - (X["Ba"] + X["Barite"]))
dSr = np.abs(
(prediction["Sr"] + prediction["Celestite"]) -
(X["Sr"] + X["Celestite"])
)
mass_balance_result = pd.DataFrame(
{"dBa": dBa, "dSr": dSr, "mass_balance": dBa + dSr, "Class": classes}
)
return mass_balance_result
def mass_balance_ratio(results, threshold=1e-5):
proportion = {}
mass_balance_threshold = results[results["mass_balance"] <= threshold]
overall = len(mass_balance_threshold)
class_0_amount = len(
mass_balance_threshold[mass_balance_threshold["Class"] == 0])
class_1_amount = len(
mass_balance_threshold[mass_balance_threshold["Class"] == 1])
proportion["overall"] = overall / len(results)
proportion["class_0"] = class_0_amount / \
len(results[results["Class"] == 0])
proportion["class_1"] = class_1_amount / \
len(results[results["Class"] == 1])
return proportion
class preprocessing:
"""
A class used to preprocess data for model training.
Attributes
"""
def __init__(self, func_dict_in=None, func_dict_out=None, random_state=42):
"""Initialization of the preprocessing object.
Args:
func_dict_in: function for transformation. Defaults to None.
func_dict_out: function for backtransformation. Defaults to None.
            random_state (int, optional): Seed for reproducibility. Defaults to 42.
"""
self.random_state = random_state
self.scaler_X = None
self.scaler_y = None
        self.func_dict_in = func_dict_in
        self.func_dict_out = func_dict_out
self.state = {"cluster": False, "log": False,
"balance": False, "scale": False}
def funcTranform(self, *args):
"""Apply the transformation function to the data columnwise.
Returns:
pandas data frame: transformed data
"""
for i in args:
for key in i.keys():
if "Class" not in key:
i[key] = i[key].apply(self.func_dict_in)
self.state["log"] = True
return args
def funcInverse(self, *args):
"""Apply the backtransformation function to the data columnwise.
Returns:
pandas data frame: backtransformed data
"""
for i in args:
for key in i.keys():
if "Class" not in key:
i[key] = i[key].apply(self.func_dict_out)
self.state["log"] = False
return args
def cluster(self, X, y, species="Barite", n_clusters=2, x_length=50, y_length=50):
"""Apply k-means clustering to the data to differentiate betweeen reactive and non-reactive cells.
Args:
X: design data set
y: target data set
            species (str, optional): Chemical species to which clustering is applied. Defaults to "Barite".
n_clusters (int, optional): Number of clusters. Defaults to 2.
x_length: x dimension of the grid. Defaults to 50.
y_length: y dimension of the grid. Defaults to 50.
Returns:
X, y dataframes with an additional column "Class" containing the cluster labels.
"""
class_labels = np.array([])
grid_length = x_length * y_length
iterations = int(len(X) / grid_length)
# calculate the cluster for each chemical iteration step
for i in range(0, iterations):
field = np.array(
X[species][(i * grid_length): (i * grid_length + grid_length)]
).reshape(x_length, y_length)
kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state).fit(
field.reshape(-1, 1)
)
class_labels = np.append(class_labels.astype(int), kmeans.labels_)
if "Class" in X.columns and "Class" in y.columns:
print("Class column already exists")
else:
class_labels_df = pd.DataFrame(class_labels, columns=["Class"])
X = pd.concat([X, class_labels_df], axis=1)
y = pd.concat([y, class_labels_df], axis=1)
self.state["cluster"] = True
return X, y
def balancer(self, X, y, strategy, sample_fraction=0.5):
"""Apply sampling strategies to balance the dataset.
Args:
X: design dataset (before the simulation)
y: target dataset (after the simulation)
strategy: Sampling strategy. Choose between "smote" (Synthetic Minority Oversampling Technique), "over" (Oversampling) and "under" (Undersampling).
sample_fraction (float, optional): Define balancer target. Specifies the target fraction of the minority class after the balancing step. Defaults to 0.5.
Returns:
X, y: resampled datasets
"""
number_features = (X.columns != "Class").sum()
if "Class" not in X.columns:
if "Class" in y.columns:
classes = y["Class"]
else:
raise Exception("No class column found")
else:
classes = X["Class"]
counter = classes.value_counts()
print("Amount class 0 before:",
counter[0] / (counter[0] + counter[1]))
print("Amount class 1 before:",
counter[1] / (counter[0] + counter[1]))
df = pd.concat(
[
X.loc[:, X.columns != "Class"],
y.loc[:, y.columns != "Class"],
classes,
],
axis=1,
)
if strategy == "smote":
print("Using SMOTE strategy")
smote = SMOTE(sampling_strategy=sample_fraction)
df_resampled, classes_resampled = smote.fit_resample(
df.loc[:, df.columns != "Class"], df.loc[:,
df.columns == "Class"]
)
elif strategy == "over":
print("Using Oversampling")
over = RandomOverSampler()
df_resampled, classes_resampled = over.fit_resample(
df.loc[:, df.columns != "Class"], df.loc[:,
df.columns == "Class"]
)
elif strategy == "under":
print("Using Undersampling")
under = RandomUnderSampler()
df_resampled, classes_resampled = under.fit_resample(
df.loc[:, df.columns != "Class"], df.loc[:,
df.columns == "Class"]
)
else:
print("No sampling selected. Output equals input.")
return X, y
counter = classes_resampled["Class"].value_counts()
print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]))
print("Amount class 1 after:", counter[1] / (counter[0] + counter[1]))
design_resampled = pd.concat(
[df_resampled.iloc[:, 0:number_features], classes_resampled], axis=1
)
target_resampled = pd.concat(
[df_resampled.iloc[:, number_features:], classes_resampled], axis=1
)
self.state["balance"] = True
return design_resampled, target_resampled
    def scale_fit(self, X, y, scaling, type="standard"):
"""Fit a scaler for data preprocessing.
Args:
X: design dataset
y: target dataset
            scaling: "individual" fits separate scalers for X and y; "global" fits a single scaler on the combined data of X and y (scaler_X and scaler_y are then identical).
            type (str, optional): "minmax" for MinMax scaling or "standard" for standardization. Defaults to "standard".
"""
if type == "minmax":
self.scaler_X = MinMaxScaler()
self.scaler_y = MinMaxScaler()
elif type == "standard":
self.scaler_X = StandardScaler()
self.scaler_y = StandardScaler()
else:
raise Exception("No valid scaler type found")
if scaling == "individual":
self.scaler_X.fit(X.iloc[:, X.columns != "Class"])
self.scaler_y.fit(y.iloc[:, y.columns != "Class"])
elif scaling == "global":
self.scaler_X.fit(
pd.concat(
[X.iloc[:, X.columns != "Class"],
y.iloc[:, y.columns != "Class"]],
axis=0,
)
)
self.scaler_y = self.scaler_X
self.state["scale"] = True
def scale_transform(self, X_train, X_test, y_train, y_test):
"""Apply learned scaler on datasets.
Args:
            X_train: design training data
            X_test: design test data
            y_train: target training data
            y_test: target test data
Returns:
transformed dataframes
"""
X_train = pd.concat(
[
self.scaler_X.transform(
X_train.loc[:, X_train.columns != "Class"]),
X_train.loc[:, "Class"],
],
axis=1,
)
X_test = pd.concat(
[
self.scaler_X.transform(
X_test.loc[:, X_test.columns != "Class"]),
X_test.loc[:, "Class"],
],
axis=1,
)
y_train = pd.concat(
[
self.scaler_y.transform(
y_train.loc[:, y_train.columns != "Class"]),
y_train.loc[:, "Class"],
],
axis=1,
)
y_test = pd.concat(
[
self.scaler_y.transform(
y_test.loc[:, y_test.columns != "Class"]),
y_test.loc[:, "Class"],
],
axis=1,
)
return X_train, X_test, y_train, y_test
def scale_inverse(self, *args):
"""Backtransform the dataset
Returns:
Backtransformed data frames
"""
result = []
for i in args:
if "Class" in i.columns:
inversed = pd.DataFrame(
self.scaler_X.inverse_transform(
i.loc[:, i.columns != "Class"]),
columns=i.columns[:-1],
)
class_column = i.loc[:, "Class"].reset_index(drop=True)
i = pd.concat([inversed, class_column], axis=1)
else:
i = pd.DataFrame(
self.scaler_X.inverse_transform(i), columns=i.columns)
result.append(i)
return result
    def split(self, X, y, ratio=0.8):
        """Split X and y into train and test sets, where ratio is the fraction used for the test set."""
        X_train, X_test, y_train, y_test = sk.train_test_split(
            X, y, test_size=ratio, random_state=self.random_state
        )
        return X_train, X_test, y_train, y_test
    def class_selection(self, *args, class_label=0):
        """Select only rows with a specific class label.
        Args:
            Dataframes from which rows with the given class label should be selected. Defaults to 0.
        Returns:
            List of dataframes restricted to the selected class label.
        """
        return [i[i["Class"] == class_label] for i in args]
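
Taken together, the class is meant to be used in the order cluster, split, balance, scale, as in `src/optuna_runs.py`. A minimal usage sketch, assuming `df_design` and `df_results` have already been loaded from the HDF5 dataset as in that script:

```python
# Sketch: typical preprocessing flow with the preprocessing class (mirrors src/optuna_runs.py).
from preprocessing import preprocessing

# df_design, df_results: loaded from the HDF5 dataset as in src/optuna_runs.py (see above)
species_columns = ['H', 'O', 'Ba', 'Cl', 'S', 'Sr', 'Barite', 'Celestite']
prep = preprocessing()

# 1) label reactive vs. non-reactive cells via k-means on the Barite field
X, y = prep.cluster(df_design[species_columns], df_results[species_columns])
# 2) hold out a test set, then balance only the training data
X_train, X_test, y_train, y_test = prep.split(X, y, ratio=0.2)
X_train, y_train = prep.balancer(X_train, y_train, strategy="over")
# 3) fit one global min/max scaler on the training data and apply it to all splits
prep.scale_fit(X_train, y_train, scaling="global", type="minmax")
X_train, X_test, y_train, y_test = prep.scale_transform(X_train, X_test, y_train, y_test)
```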