adapt notebook for new dataset

Hannes Signer 2025-03-26 13:35:17 +01:00
parent c7b89505c2
commit 52940efdb9
2 changed files with 233 additions and 458 deletions

File diff suppressed because one or more lines are too long


@@ -21,7 +21,7 @@ from importlib import reload
 set_config(transform_output="pandas")

-def model_definition(architecture):
+def model_definition(architecture, n_input, n_output):
     """Definition of the respective AI model. Three models are currently being analysed, which are labelled small, large or paper.

     Args:
@@ -34,66 +34,71 @@ def model_definition(architecture):
     if architecture == "small":
         model = keras.Sequential(
             [
-                keras.Input(shape=(8,), dtype=dtype),
+                keras.Input(shape=(n_input,), dtype=dtype),
                 keras.layers.Dense(units=128, dtype=dtype),
                 LeakyReLU(negative_slope=0.01),
                 # Dropout(0.2),
                 keras.layers.Dense(units=128, dtype=dtype),
                 LeakyReLU(negative_slope=0.01),
-                keras.layers.Dense(units=8, dtype=dtype),
+                keras.layers.Dense(units=n_output, dtype=dtype),
             ]
         )
     elif architecture == "large":
         model = keras.Sequential(
             [
-                keras.layers.Input(shape=(8,), dtype=dtype),
+                keras.layers.Input(shape=(n_input,), dtype=dtype),
                 keras.layers.Dense(512, dtype=dtype),
                 LeakyReLU(negative_slope=0.01),
                 keras.layers.Dense(1024, dtype=dtype),
                 LeakyReLU(negative_slope=0.01),
                 keras.layers.Dense(512, dtype=dtype),
                 LeakyReLU(negative_slope=0.01),
-                keras.layers.Dense(8, dtype=dtype),
+                keras.layers.Dense(n_output, dtype=dtype),
             ]
         )
elif architecture == "large_batch_normalization": elif architecture == "large_batch_normalization":
model = keras.Sequential([ model = keras.Sequential([
keras.layers.Input(shape=(8,), dtype=dtype), keras.layers.Input(shape=(n_input,), dtype=dtype),
BatchNormalization(), BatchNormalization(),
Dense(512, dtype=dtype), Dense(512, dtype=dtype),
LeakyReLU(negative_slope=0.01), LeakyReLU(negative_slope=0.01),
# BatchNormalization(), Dropout(0.05),
BatchNormalization(),
Dense(1024, dtype=dtype), Dense(1024, dtype=dtype),
LeakyReLU(negative_slope=0.01), LeakyReLU(negative_slope=0.01),
# BatchNormalization(), Dropout(0.05),
BatchNormalization(),
Dense(512, dtype=dtype), Dense(512, dtype=dtype),
LeakyReLU(negative_slope=0.01), Dropout(0.05),
LeakyReLU(negative_slope=0.01),
Dense(8, dtype=dtype), Dense(n_output, dtype=dtype),
]) ])
elif architecture == "large_self_normalization": elif architecture == "large_self_normalization":
model = keras.Sequential([ model = keras.Sequential([
keras.layers.Input(shape=(8,), dtype=dtype), keras.layers.Input(shape=(n_input,), dtype=dtype),
Dense(512, activation='selu', kernel_initializer='lecun_normal', dtype=dtype), Dense(512, activation='selu',
AlphaDropout(0.05), kernel_initializer='lecun_normal', dtype=dtype),
Dense(1024, activation='selu', kernel_initializer='lecun_normal',dtype=dtype), AlphaDropout(0.05),
AlphaDropout(0.05), Dense(1024, activation='selu',
Dense(512, activation='selu', kernel_initializer='lecun_normal',dtype=dtype), kernel_initializer='lecun_normal', dtype=dtype),
AlphaDropout(0.05), AlphaDropout(0.05),
Dense(8, dtype=dtype), Dense(512, activation='selu',
kernel_initializer='lecun_normal', dtype=dtype),
AlphaDropout(0.05),
Dense(n_output, dtype=dtype),
]) ])
elif architecture == "paper": elif architecture == "paper":
model = keras.Sequential( model = keras.Sequential(
[ [
keras.layers.Input(shape=(8,), dtype=dtype), keras.layers.Input(shape=(n_input,), dtype=dtype),
keras.layers.Dense(128, dtype=dtype), keras.layers.Dense(128, dtype=dtype),
LeakyReLU(negative_slope=0.01), LeakyReLU(negative_slope=0.01),
keras.layers.Dense(256, dtype=dtype), keras.layers.Dense(256, dtype=dtype),
@ -102,14 +107,13 @@ def model_definition(architecture):
LeakyReLU(negative_slope=0.01), LeakyReLU(negative_slope=0.01),
keras.layers.Dense(256, dtype=dtype), keras.layers.Dense(256, dtype=dtype),
LeakyReLU(negative_slope=0.01), LeakyReLU(negative_slope=0.01),
keras.layers.Dense(8, dtype=dtype), keras.layers.Dense(n_output, dtype=dtype),
] ]
) )
else: else:
raise Exception( raise Exception(
"No valid architecture found." "No valid architecture found."
+ "Choose between 'small', 'large' or 'paper'."
) )
return model return model
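The signature change above lets the first and last layer widths follow the dataset instead of the hard-coded width of 8. A minimal call-site sketch, assuming the features and targets are pandas DataFrames; the frame names and column labels below are illustrative, not part of the commit:

import pandas as pd

# Hypothetical training frames standing in for the new dataset.
X_train = pd.DataFrame({"Ba": [0.1, 0.2], "Sr": [0.3, 0.4], "Cl": [0.5, 0.6]})
y_train = pd.DataFrame({"Barite": [0.1, 0.2], "Celestite": [0.3, 0.4]})

n_input = X_train.shape[1]   # number of feature columns -> input layer width
n_output = y_train.shape[1]  # number of target columns -> output layer width
model = model_definition("small", n_input, n_output)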
@@ -139,7 +143,7 @@ def custom_loss(
         h1: hyperparameter for the importance of the huber loss
         h2: hyperparameter for the importance of the Barium mass balance term
         h3: hyperparameter for the importance of the Strontium mass balance term
-        scaler_type: Normalization approach. Choose between "standard" and "minmax". Defaults to "minmax".
+        scaler_type: Normalization approach. Choose between "standard", "minmax" and "none". Defaults to "minmax".
         loss_variant: Loss function approach. Choose between "huber and "huber_mass_balance". Defaults to "huber".
         delta: Hyperparameter for the Huber function threshold. Defaults to 1.0.
@@ -149,40 +153,34 @@ def custom_loss(
     # as far as I know tensorflow does not directly support the use of scaler objects
     # therefore, the backtransformation is done manually
     if preprocess.scaler_type != scaler_type:
         raise Exception(
-            "Data normalized with scaler different than specified for the training. Compare the scaling approach on preprocessing and training.")
-    try:
-        if scaler_type == "minmax":
-            scale_X = tf.convert_to_tensor(
-                preprocess.scaler_X.data_range_, dtype=tf.float32
-            )
-            min_X = tf.convert_to_tensor(
-                preprocess.scaler_X.data_min_, dtype=tf.float32
-            )
-            scale_y = tf.convert_to_tensor(
-                preprocess.scaler_y.data_range_, dtype=tf.float32
-            )
-            min_y = tf.convert_to_tensor(
-                preprocess.scaler_y.data_min_, dtype=tf.float32
-            )
-        elif scaler_type == "standard":
-            scale_X = tf.convert_to_tensor(
-                preprocess.scaler_X.scale_, dtype=tf.float32)
-            mean_X = tf.convert_to_tensor(
-                preprocess.scaler_X.mean_, dtype=tf.float32)
-            scale_y = tf.convert_to_tensor(
-                preprocess.scaler_y.scale_, dtype=tf.float32)
-            mean_y = tf.convert_to_tensor(
-                preprocess.scaler_y.mean_, dtype=tf.float32)
-    except AttributeError:
-        raise Exception(
-            "Data normalized with scaler different than specified for the training. Compare the scaling approach on preprocessing and training."
-        )
+            "Data normalized with scaler different than specified for the training. Compare the scaling approach on preprocessing and training. Scaler type on preprocessing was {0} and on training {1}.".format(
+                preprocess.scaler_type, scaler_type))
+    if scaler_type == "minmax":
+        scale_X = tf.convert_to_tensor(
+            preprocess.scaler_X.data_range_, dtype=tf.float32
+        )
+        min_X = tf.convert_to_tensor(
+            preprocess.scaler_X.data_min_, dtype=tf.float32
+        )
+        scale_y = tf.convert_to_tensor(
+            preprocess.scaler_y.data_range_, dtype=tf.float32
+        )
+        min_y = tf.convert_to_tensor(
+            preprocess.scaler_y.data_min_, dtype=tf.float32
+        )
+    elif scaler_type == "standard":
+        scale_X = tf.convert_to_tensor(
+            preprocess.scaler_X.scale_, dtype=tf.float32)
+        mean_X = tf.convert_to_tensor(
+            preprocess.scaler_X.mean_, dtype=tf.float32)
+        scale_y = tf.convert_to_tensor(
+            preprocess.scaler_y.scale_, dtype=tf.float32)
+        mean_y = tf.convert_to_tensor(
+            preprocess.scaler_y.mean_, dtype=tf.float32)
     def loss(results, predicted):
         # inverse min/max scaling
@@ -194,7 +192,7 @@ def custom_loss(
         elif scaler_type == "standard":
             predicted_inverse = predicted * scale_y + mean_y
             results_inverse = results * scale_X + mean_X
         elif scaler_type == "none":
             predicted_inverse = predicted
             results_inverse = results
@@ -288,7 +286,7 @@ def mass_balance_metric(preprocess, column_dict, scaler_type="minmax"):
         elif scaler_type == "standard":
             predicted_inverse = predicted * scale_y + mean_y
             results_inverse = results * scale_X + mean_X
         elif scaler_type == "none":
             predicted_inverse = predicted
             results_inverse = results
@@ -357,16 +355,16 @@ def mass_balance_evaluation(model, X, preprocess):
     classes.reset_index(drop=True, inplace=True)
     prediction = pd.DataFrame(model.predict(X[columns]), columns=columns)
     # backtransform min/max or standard scaler
     if preprocess.scaler_X is None:
         X = pd.DataFrame(
-            preprocess.scaler_X.inverse_transform(X.iloc[:, X.columns != "Class"]),
-            columns=columns,
+            preprocess.scaler_X.inverse_transform(
+                X.iloc[:, X.columns != "Class"]),
+            columns=columns,
         )
         prediction = pd.DataFrame(
             preprocess.scaler_y.inverse_transform(prediction), columns=columns
         )
     # apply backtransformation if log transformation was applied
     if preprocess.func_dict_out is not None:
@@ -429,6 +427,7 @@ class preprocessing:
         self.func_dict_out = func_dict_out if func_dict_out is not None else None
         self.state = {"cluster": False, "log": False,
                       "balance": False, "scale": False}
+        self.scaler_type = "none"

     def funcTranform(self, *args):
         """Apply the transformation function to the data columnwise.
@@ -456,7 +455,7 @@ class preprocessing:
             self.state["log"] = False
         return args

-    def cluster(self, X, y, species="Barite", n_clusters=2, x_length=50, y_length=50):
+    def cluster_kmeans(self, X, y, species="Barite", n_clusters=2, x_length=50, y_length=50):
         """Apply k-means clustering to the data to differentiate betweeen reactive and non-reactive cells.

         Args:
@@ -470,10 +469,10 @@ class preprocessing:

         Returns:
             X, y dataframes with an additional column "Class" containing the cluster labels.
         """
         class_labels = np.array([])
         grid_length = x_length * y_length
         iterations = int(len(X) / grid_length)
         # calculate the cluster for each chemical iteration step
         for i in range(0, iterations):
             field = np.array(
@@ -483,16 +482,31 @@ class preprocessing:
                 field.reshape(-1, 1)
             )
             class_labels = np.append(class_labels.astype(int), kmeans.labels_)
         if "Class" in X.columns and "Class" in y.columns:
             print("Class column already exists")
         else:
             class_labels_df = pd.DataFrame(class_labels, columns=["Class"])
             X = pd.concat([X, class_labels_df], axis=1)
             y = pd.concat([y, class_labels_df], axis=1)
         self.state["cluster"] = True
         return X, y
+    def cluster_manual(self, X, y, species="Cl", threshold=1E-10):
+        if "Class" in X.columns or "Class" in y.columns:
+            raise Exception("Class column already exists")
+        label = np.zeros(len(X))
+        label[X[species] > threshold] = 1
+        X["Class"] = label
+        y["Class"] = label
+        return X, y
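The new cluster_manual method offers a simpler alternative to the per-step k-means: cells where the chosen species exceeds the threshold are labelled 1, all others 0, and the labels are attached to both X and y as a "Class" column. A toy illustration of the thresholding (data invented):

import numpy as np
import pandas as pd

X = pd.DataFrame({"Cl": [0.0, 5e-9, 2e-11]})
y = pd.DataFrame({"Barite": [0.1, 0.2, 0.3]})

label = np.zeros(len(X))
label[X["Cl"] > 1e-10] = 1   # -> [0., 1., 0.]
X["Class"] = label
y["Class"] = label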
     def balancer(self, X, y, strategy, sample_fraction=0.5):
         """Apply sampling strategies to balance the dataset.
@@ -570,7 +584,7 @@ class preprocessing:
         self.state["balance"] = True
         return design_resampled, target_resampled

-    def scale_fit(self, X, y, scaling, type="Standard"):
+    def scale_fit(self, X, y, scaling, type="standard"):
         self.scaler_type = type
         """Fit a scaler for data preprocessing.
@@ -697,7 +711,8 @@ class preprocessing:
         Returns:
             Elements with selected class label.
         """
+        result = []
         for i in args:
-            i = i[i["Class"] == class_label]
-        return args
+            result.append(i[i["Class"] == class_label])
+        return result
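This last hunk fixes the class filter: the old loop rebound its loop variable and returned the untouched input frames, so the filtering had no effect; the new version collects the filtered frames in a list and returns that. A self-contained sketch of the corrected behaviour (the helper name and frames are hypothetical, not the method in the commit):

import pandas as pd

def select_class(class_label, *args):
    # Collect filtered copies instead of reassigning the loop variable.
    result = []
    for i in args:
        result.append(i[i["Class"] == class_label])
    return result

X = pd.DataFrame({"Ba": [1.0, 2.0], "Class": [0, 1]})
y = pd.DataFrame({"Barite": [0.1, 0.2], "Class": [0, 1]})
X_reactive, y_reactive = select_class(1, X, y)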