import keras

print("Running Keras in version {}".format(keras.__version__))

import h5py
import numpy as np
import pandas as pd
import time
import sklearn.model_selection as sk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import clone

# preprocessing pipeline #


def Safelog(val):
    """Sign-preserving log10: log10(val) for val > 0, -log10(-val) for
    val < 0, and 0 for val == 0.

    Used to compress the dynamic range of concentration-like columns
    before training.
    """
    if val > 0:
        return np.log10(val)
    elif val < 0:
        return -np.log10(-val)
    else:
        return 0


def Safeexp(val):
    """Inverse of Safelog for originals with |x| < 1.

    A positive transformed value maps back to a small negative original
    (-10**-val) and a negative transformed value maps back to a small
    positive original (10**val). NOTE(review): this is only a true
    inverse when the untransformed data lie in (-1, 1) — presumably the
    concentration range here; confirm against the datasets used.
    """
    if val > 0:
        return -10 ** -val
    elif val < 0:
        return 10 ** val
    else:
        return 0


class FuncTransform():
    '''
    Class to transform and inverse transform data with given functions.
    Transform and inverse transform functions have to be given as
    dictionaries in the following format:
    {'key1': function1, 'key2': function2, ...}

    Columns whose name contains "Class" are left untouched.
    '''

    def __init__(self, func_transform, func_inverse):
        # Per-column callables, keyed by column name.
        self.func_transform = func_transform
        self.func_inverse = func_inverse

    def fit(self, X, y=None):
        # Stateless transformer; fit is a no-op kept for sklearn-style API.
        return self

    def transform(self, X, y=None):
        """Apply the per-column transform functions to a copy of X."""
        X = X.copy()
        for key in X.keys():
            if "Class" not in key:
                X[key] = X[key].apply(self.func_transform[key])
        return X

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X, y)

    def inverse_transform(self, X_log):
        """Apply the per-column inverse functions to a copy of X_log."""
        X_log = X_log.copy()
        for key in X_log.keys():
            if "Class" not in key:
                X_log[key] = X_log[key].apply(self.func_inverse[key])
        return X_log


def clustering(X, n_clusters=2, random_state=42, x_length=50, y_length=50,
               species='Barite'):
    '''
    Function to cluster data with KMeans.

    X is assumed to hold stacked simulation frames of x_length * y_length
    cells each; every frame of the `species` column is clustered
    independently on its cell values, and the labels are appended as a
    new "Class" column.

    Parameters
    ----------
    X : pd.DataFrame with at least the `species` column.
    n_clusters : number of KMeans clusters per frame.
    random_state : seed forwarded to KMeans.
    x_length, y_length : grid dimensions of one simulation frame.
    species : column to cluster on (new parameter; default 'Barite'
        preserves the original behavior).

    Returns
    -------
    pd.DataFrame — X with a "Class" column appended, or X unchanged if a
    "Class" column already exists.
    '''
    if "Class" in X.columns:
        # Bug fix: the original fell through to `return X_clustered`
        # without ever assigning it, raising UnboundLocalError.
        print("Class column already exists")
        return X
    class_labels = np.array([])
    grid_length = x_length * y_length
    iterations = int(len(X) / grid_length)
    for i in range(iterations):
        field = np.array(
            X[species][i * grid_length:(i + 1) * grid_length]
        ).reshape(x_length, y_length)
        kmeans = KMeans(n_clusters=n_clusters,
                        random_state=random_state).fit(field.reshape(-1, 1))
        class_labels = np.append(class_labels.astype(int), kmeans.labels_)
    class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
    return pd.concat([X, class_labels_df], axis=1)


def balancer(design, target, strategy, sample_fraction=0.5):
    """Rebalance the two "Class" populations of a design/target pair.

    The "Class" column may live in either `design` or `target`; design
    and target features are concatenated column-wise, resampled
    together, then split back apart by the original feature count.

    Parameters
    ----------
    design, target : pd.DataFrame — features plus (in one of them) a
        binary "Class" column.
    strategy : 'smote', 'over', 'under', or anything else for a no-op
        pass-through of the inputs.
    sample_fraction : sampling_strategy forwarded to SMOTE only.

    Raises
    ------
    Exception if neither frame has a "Class" column.
    """
    number_features = (design.columns != "Class").sum()
    if "Class" not in design.columns:
        if "Class" in target.columns:
            classes = target['Class']
        else:
            raise Exception("No class column found")
    else:
        classes = design['Class']
    counter = classes.value_counts()
    print("Amount class 0 before:",
          counter[0] / (counter[0] + counter[1]))
    print("Amount class 1 before:",
          counter[1] / (counter[0] + counter[1]))
    df = pd.concat([design.loc[:, design.columns != "Class"],
                    target.loc[:, target.columns != "Class"],
                    classes], axis=1)
    features = df.loc[:, df.columns != "Class"]
    labels = df.loc[:, df.columns == "Class"]
    if strategy == 'smote':
        print("Using SMOTE strategy")
        sampler = SMOTE(sampling_strategy=sample_fraction)
    elif strategy == 'over':
        print("Using Oversampling")
        sampler = RandomOverSampler()
    elif strategy == 'under':
        print("Using Undersampling")
        sampler = RandomUnderSampler()
    else:
        # Unknown strategy: deliberate pass-through, no resampling.
        return design, target
    df_resampled, classes_resampled = sampler.fit_resample(features, labels)
    counter = classes_resampled["Class"].value_counts()
    print("Amount class 0 after:",
          counter[0] / (counter[0] + counter[1]))
    print("Amount class 1 after:",
          counter[1] / (counter[0] + counter[1]))
    # Columns [0:number_features] are the design features, the rest are
    # the target features; both get the resampled labels re-attached.
    design_resampled = pd.concat([df_resampled.iloc[:, 0:number_features],
                                  classes_resampled], axis=1)
    target_resampled = pd.concat([df_resampled.iloc[:, number_features:],
                                  classes_resampled], axis=1)
    return design_resampled, target_resampled


def plot_simulation(X, timestep, component='Barite', x_length=50,
                    y_length=50):
    """Render one simulation frame of `component` as an image; if a
    "Class" column exists, overlay its 0/1 boundary as a red contour.

    Raises
    ------
    Exception if `timestep` exceeds the number of complete frames in X.
    """
    grid_length = x_length * y_length
    max_iter = int(len(X) / grid_length)
    if timestep >= max_iter:
        raise Exception("timestep is not in the simulation range")
    frame = slice(timestep * grid_length, (timestep + 1) * grid_length)
    plt.imshow(np.array(X[component][frame]).reshape(x_length, y_length),
               interpolation='bicubic', origin='lower')
    if "Class" in X.columns:
        plt.contour(np.array(X['Class'][frame]).reshape(x_length, y_length),
                    levels=[0.1], colors='red', origin='lower')
    plt.show()


def _scale_features(df, scaler):
    """Scale every non-"Class" column of df with the fitted scaler and
    re-attach the untouched "Class" column.

    Bug fix: the original passed the raw ndarray returned by
    scaler.transform() straight into pd.concat, which raises TypeError;
    the array must be wrapped back into a DataFrame first.
    """
    feature_mask = df.columns != "Class"
    scaled = pd.DataFrame(scaler.transform(df.loc[:, feature_mask]),
                          columns=df.columns[feature_mask],
                          index=df.index)
    return pd.concat([scaled, df.loc[:, "Class"]], axis=1)


def preprocessing_training(df_design, df_targets, func_dict_in,
                           func_dict_out, sampling, scaling, test_size):
    """Full training-data pipeline: cluster, log-transform, split,
    rebalance, and scale.

    Parameters
    ----------
    df_design, df_targets : raw design/target frames.
    func_dict_in, func_dict_out : per-column transform / inverse
        dictionaries for FuncTransform.
    sampling : strategy string forwarded to balancer().
    scaling : 'individual' (separate MinMaxScaler per frame) or 'global'
        (one scaler fitted on design and target features stacked
        row-wise — assumes both share column names; TODO confirm).
    test_size : fraction forwarded to train_test_split.

    Returns
    -------
    X_train, X_val, X_test, y_train, y_val, y_test, scaler_X, scaler_y
    """
    df_design = clustering(df_design)
    df_targets = pd.concat([df_targets, df_design['Class']], axis=1)
    df_design_log = FuncTransform(func_dict_in,
                                  func_dict_out).fit_transform(df_design)
    df_results_log = FuncTransform(func_dict_in,
                                   func_dict_out).fit_transform(df_targets)
    X_train, X_test, y_train, y_test = sk.train_test_split(
        df_design_log, df_results_log, test_size=test_size, random_state=42)
    X_train, y_train = balancer(X_train, y_train, sampling)
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    if scaling == 'individual':
        scaler_X.fit(X_train.loc[:, X_train.columns != "Class"])
        scaler_y.fit(y_train.loc[:, y_train.columns != "Class"])
    elif scaling == 'global':
        scaler_X.fit(pd.concat([X_train.loc[:, X_train.columns != "Class"],
                                y_train.loc[:, y_train.columns != "Class"]],
                               axis=0))
        # Bug fix: sklearn.base.clone() returns an UNFITTED copy, so the
        # original crashed with NotFittedError on scaler_y.transform().
        # Share the single fitted scaler instead.
        scaler_y = scaler_X
    X_train = _scale_features(X_train, scaler_X)
    X_test = _scale_features(X_test, scaler_X)
    y_train = _scale_features(y_train, scaler_y)
    y_test = _scale_features(y_test, scaler_y)
    # Carve a validation split out of the (already balanced) training set.
    X_train, X_val, y_train, y_val = sk.train_test_split(X_train, y_train,
                                                         test_size=0.1)
    return X_train, X_val, X_test, y_train, y_val, y_test, scaler_X, scaler_y


class preprocessing:
    """Stateful variant of the preprocessing pipeline.

    Holds the design (X) and target (y) frames and tracks which pipeline
    stages have run in `self.state`; clustering requires the log
    transform to have been applied first.
    """

    def __init__(self, df_design, df_targets, random_state=42):
        self.X = df_design
        self.y = df_targets
        self.random_state = random_state
        # Pipeline progress flags, checked by the stage methods.
        self.state = {"cluster": False, "log": False,
                      "balance": False, "scale": False}

    def funcTranform(self, func_dict_in):
        """Apply the per-column transform to X and y in place.

        NOTE(review): the misspelled name is the existing public API and
        is kept for compatibility. Assumes X and y share the same
        non-"Class" column names — TODO confirm.
        """
        for key in self.X.keys():
            if "Class" not in key:
                self.X[key] = self.X[key].apply(func_dict_in[key])
                self.y[key] = self.y[key].apply(func_dict_in[key])
        self.state["log"] = True

    def funcInverse(self, func_dict_out):
        """Undo funcTranform in place; requires the transform to have run."""
        if self.state["log"] == False:
            raise Exception("Data has to be transformed first")
        for key in self.X.keys():
            if "Class" not in key:
                self.X[key] = self.X[key].apply(func_dict_out[key])
                self.y[key] = self.y[key].apply(func_dict_out[key])

    def cluster(self, species='Barite', n_clusters=2, x_length=50,
                y_length=50):
        """KMeans-cluster each frame of `species` and append the labels
        as a "Class" column to both X and y.

        Bug fix: the original ignored the `species` parameter and always
        clustered on the hard-coded 'Barite' column; the default keeps
        the old behavior.
        """
        if self.state["log"] == False:
            raise Exception("Data has to be transformed first")
        class_labels = np.array([])
        grid_length = x_length * y_length
        iterations = int(len(self.X) / grid_length)
        for i in range(iterations):
            field = np.array(
                self.X[species][i * grid_length:(i + 1) * grid_length]
            ).reshape(x_length, y_length)
            kmeans = KMeans(n_clusters=n_clusters,
                            random_state=self.random_state
                            ).fit(field.reshape(-1, 1))
            class_labels = np.append(class_labels.astype(int),
                                     kmeans.labels_)
        if "Class" in self.X.columns and "Class" in self.y.columns:
            print("Class column already exists")
        else:
            class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
            self.X = pd.concat([self.X, class_labels_df], axis=1)
            self.y = pd.concat([self.y, class_labels_df], axis=1)
        self.state["cluster"] = True