Mirror of https://git.gfz-potsdam.de/naaice/model-training.git (synced 2025-12-13 12:18:22 +01:00)
update preprocessing

parent 78a57654a3
commit 67b1f23fa3
File diff suppressed because one or more lines are too long
@@ -14,6 +14,7 @@ from imblearn.over_sampling import RandomOverSampler
 from collections import Counter
 import os
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.base import clone
 
 # preprocessing pipeline
 #
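The newly imported sklearn.base.clone is used further down, in the 'global' scaling branch, to derive scaler_y from scaler_X. For reference, clone copies an estimator's constructor parameters but not its fitted state, so the copy starts out unfitted; a minimal sketch with toy data:

from sklearn.base import clone
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler().fit(np.array([[0.0], [2.0], [4.0]]))  # fitted on toy data
copy = clone(scaler)                                          # same hyperparameters, no fitted attributes
print(hasattr(scaler, "data_min_"))   # True
print(hasattr(copy, "data_min_"))     # False: clone() returns an unfitted estimator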
@@ -86,7 +87,7 @@ def clustering(X, n_clusters=2, random_state=42, x_length=50, y_length=50):
 
         class_labels = np.append(class_labels.astype(int), kmeans.labels_)
 
-    if("Class" in X.columns and "Class" in X.columns):
+    if("Class" in X.columns):
         print("Class column already exists")
     else:
         class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
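For context, the clustering() function touched here assigns a KMeans label to every cell of each simulated grid and collects them in class_labels. A minimal, self-contained sketch of that per-grid labelling pattern; the 50x50 grid size and the 'Barite' column come from the function defaults elsewhere in this diff, the toy data is illustrative:

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

x_length, y_length = 50, 50
grid_length = x_length * y_length

# Toy stand-in for the 'Barite' column: two stacked 50x50 snapshots.
rng = np.random.default_rng(42)
barite = pd.Series(rng.random(2 * grid_length), name="Barite")

class_labels = np.array([], dtype=int)
for i in range(len(barite) // grid_length):
    field = barite.iloc[i * grid_length:(i + 1) * grid_length].to_numpy().reshape(x_length, y_length)
    kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(field.reshape(-1, 1))
    class_labels = np.append(class_labels, kmeans.labels_)

class_labels_df = pd.DataFrame(class_labels, columns=["Class"])  # one label per grid cell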
@@ -126,7 +127,7 @@ def balancer(design, target, strategy, sample_fraction=0.5):
         df_resampled, classes_resampled = under.fit_resample(df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])
 
     else:
-        classes_resampled = classes
+        return design, target
 
     counter = classes_resampled["Class"].value_counts()
     print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]) )
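With this change the final else branch of balancer() returns the design/target frames unchanged when no resampling strategy matches, instead of continuing to the class-count statistics. A hedged sketch of the imblearn call pattern the resampling branches rely on; RandomUnderSampler, the column names and the 0.5 sampling fraction are assumptions, only the fit_resample call shape mirrors the hunk:

import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

# Toy frame with an imbalanced binary "Class" column.
rng = np.random.default_rng(0)
df = pd.DataFrame({"Barite": rng.random(1000),
                   "Class": np.r_[np.zeros(950, dtype=int), np.ones(50, dtype=int)]})

under = RandomUnderSampler(sampling_strategy=0.5, random_state=0)
df_resampled, classes_resampled = under.fit_resample(
    df.loc[:, df.columns != "Class"], df.loc[:, df.columns == "Class"])

counter = classes_resampled["Class"].value_counts()
print("Amount class 0 after:", counter[0] / (counter[0] + counter[1]))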
@@ -152,9 +153,10 @@ def plot_simulation(X, timestep, component='Barite', x_length=50, y_length=50):
     plt.show()
 
 
-def preprocessing(df_design, df_targets, func_dict_in, func_dict_out, sampling, test_size):
+def preprocessing_training(df_design, df_targets, func_dict_in, func_dict_out, sampling, scaling, test_size):
 
     df_design = clustering(df_design)
+    df_targets = pd.concat([df_targets, df_design['Class']], axis=1)
 
     df_design_log = FuncTransform(func_dict_in, func_dict_out).fit_transform(df_design)
     df_results_log = FuncTransform(func_dict_in, func_dict_out).fit_transform(df_targets)
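A hedged sketch of how the renamed preprocessing_training() might be called after this commit; only the signature and the returned scalers come from this diff, while the column names, the log/inverse dictionaries and the argument values are illustrative:

import numpy as np
import pandas as pd

# Hypothetical per-column transform dictionaries, as the func_dict_* arguments suggest.
columns = ["Barite", "Ba+2", "SO4-2"]
func_dict_in = {c: np.log10 for c in columns}
func_dict_out = {c: (lambda v: 10.0 ** v) for c in columns}

df_design = pd.read_csv("design.csv")    # illustrative input files
df_targets = pd.read_csv("targets.csv")

# preprocessing_training is the function defined in the file this diff modifies.
(X_train, X_val, X_test,
 y_train, y_val, y_test,
 scaler_X, scaler_y) = preprocessing_training(
    df_design, df_targets,
    func_dict_in, func_dict_out,
    sampling="undersampling",   # assumed value; the accepted options are not shown in this diff
    scaling="global",           # 'individual' fits separate X/y scalers, 'global' shares one fit
    test_size=0.2,
)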
@@ -166,14 +168,72 @@ def preprocessing(df_design, df_targets, func_dict_in, func_dict_out, sampling,
     scaler_X = MinMaxScaler()
     scaler_y = MinMaxScaler()
 
-    X_train = scaler_X.fit_transform(X_train)
-    X_test = scaler_X.transform(X_test)
+    if scaling == 'individual':
+        scaler_X.fit(X_train.iloc[:, X_train.columns != "Class"])
+        scaler_y.fit(y_train.iloc[:, y_train.columns != "Class"])
 
+    elif scaling == 'global':
+        scaler_X.fit(pd.concat([X_train.iloc[:, X_train.columns != "Class"], y_train.iloc[:, y_train.columns != "Class"]], axis=0))
+        scaler_y = clone(scaler_X)
+
+    X_train = pd.concat([scaler_X.transform(X_train.loc[:, X_train.columns != "Class"]), X_train.loc[:, "Class"]], axis=1)
+    X_test = pd.concat([scaler_X.transform(X_test.loc[:, X_test.columns != "Class"]), X_test.loc[:, "Class"]], axis=1)
+
-    y_train = scaler_y.fit_transform(y_train)
-    y_test = scaler_y.transform(y_test)
+    y_train = pd.concat([scaler_y.transform(y_train.loc[:, y_train.columns != "Class"]), y_train.loc[:, "Class"]], axis=1)
+    y_test = pd.concat([scaler_y.transform(y_test.loc[:, y_test.columns != "Class"]), y_test.loc[:, "Class"]], axis=1)
 
     X_train, X_val, y_train, y_val = sk.train_test_split(X_train, y_train, test_size = 0.1)
 
-    return X_train, X_val, X_test, y_train, y_val, y_test
+    return X_train, X_val, X_test, y_train, y_val, y_test, scaler_X, scaler_y
 
 
+
+class preprocessing:
+
+    def __init__(self, df_design, df_targets, random_state=42):
+        self.X = df_design
+        self.y = df_targets
+        self.random_state = random_state
+        self.state = {"cluster": False, "log": False, "balance": False, "scale": False}
+
+    def funcTranform(self, func_dict_in):
+        for key in self.X.keys():
+            if "Class" not in key:
+                self.X[key] = self.X[key].apply(func_dict_in[key])
+                self.y[key] = self.y[key].apply(func_dict_in[key])
+        self.state["log"] = True
+
+    def funcInverse(self, func_dict_out):
+
+        if(self.state["log"] == False):
+            raise Exception("Data has to be transformed first")
+        for key in self.X.keys():
+            if "Class" not in key:
+                self.X[key] = self.X[key].apply(func_dict_out[key])
+                self.y[key] = self.y[key].apply(func_dict_out[key])
+
+    def cluster(self, species='Barite', n_clusters=2, x_length=50, y_length=50):
+
+        if(self.state["log"] == False):
+            raise Exception("Data has to be transformed first")
+        class_labels = np.array([])
+        grid_length = x_length * y_length
+        iterations = int(len(self.X) / grid_length)
+
+        for i in range(0, iterations):
+            field = np.array(self.X['Barite'][(i*grid_length):(i*grid_length+grid_length)]
+                             ).reshape(x_length, y_length)
+            kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state).fit(field.reshape(-1, 1))
+
+            class_labels = np.append(class_labels.astype(int), kmeans.labels_)
+
+        if ("Class" in self.X.columns and "Class" in self.y.columns):
+            print("Class column already exists")
+        else:
+            class_labels_df = pd.DataFrame(class_labels, columns=['Class'])
+            self.X = pd.concat([self.X, class_labels_df], axis=1)
+            self.y = pd.concat([self.y, class_labels_df], axis=1)
+            self.state["cluster"] = True
+
+
+
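The new preprocessing class records which steps have run in its state dict, so the log transform has to be applied before cluster() or funcInverse(). A minimal usage sketch under that reading; the toy DataFrames and transform dictionaries are stand-ins, only the method names and ordering come from the added code:

import numpy as np
import pandas as pd

# One 50x50 grid worth of toy data with a single 'Barite' column on both sides.
n_cells = 50 * 50
rng = np.random.default_rng(42)
df_design = pd.DataFrame({"Barite": rng.uniform(1e-6, 1e-3, n_cells)})
df_targets = pd.DataFrame({"Barite": rng.uniform(1e-6, 1e-3, n_cells)})

func_dict_in = {"Barite": np.log10}
func_dict_out = {"Barite": lambda v: 10.0 ** v}

prep = preprocessing(df_design, df_targets)   # the class added in this commit
prep.funcTranform(func_dict_in)               # sets state["log"]; required before cluster()
prep.cluster(n_clusters=2, x_length=50, y_length=50)
print(prep.X["Class"].value_counts())         # per-cell KMeans labels appended by cluster()
prep.funcInverse(func_dict_out)               # undoes the log transform on the non-Class columns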