mirror of https://git.gfz-potsdam.de/naaice/poet.git, synced 2025-12-15 20:38:23 +01:00
feat: cluster labels from R function
This commit is contained in: parent 81723f81f8, commit 1c4b949ce9
README.md (29 changes)
@@ -253,7 +253,7 @@ The following variables and functions must be declared:
- `model_file_path` [*string*]: Path to the Keras model file with which
  the AI surrogate model is initialized.

- `validate_predictions(predictors, prediction)` [*function*]: Returns a
- `validate_predictions(predictors, prediction)` [*function*]: Must return a
  boolean vector of length `nrow(predictions)`. The output of this function
  defines which predictions are considered valid and which are rejected.
  The predictors and predictions are passed in their original (not
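A minimal sketch of the `validate_predictions()` contract in R, assuming `prediction` arrives as a data frame or matrix with one row per grid cell; the finiteness and non-negativity criteria are illustrative assumptions, not POET's shipped defaults:

```r
# Hypothetical validity check: accept a row only if every predicted value is
# finite and non-negative; returns one boolean per row of the prediction.
validate_predictions <- function(predictors, prediction) {
  pred <- as.matrix(prediction)
  finite_ok <- apply(pred, 1, function(row) all(is.finite(row)))
  nonneg_ok <- apply(pred, 1, function(row) all(row >= 0))
  finite_ok & nonneg_ok
}
```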
@@ -279,16 +279,6 @@ of data from the front of the buffer. Defaults to the size of the Field.
  should be used instead of the custom C++ implementation (Keras might be faster
  for larger models, especially on GPU). Defaults to false.

- `use_k_means_clustering` [*bool*]: Decides if the K-Means clustering function
  will be used to separate the field into a reactive and a non-reactive cluster.
  Training and inference will be done with separate models for each cluster.
  Defaults to false.

- `model_reactive_file_path` [*string*]: Path to the Keras model file with
  which the AI surrogate model for the reactive cluster is initialized. If
  omitted, the models for both clusters will be initialized from
  `model_file_path`.

- `disable_training` [*bool*]: Deactivates the training functions. Defaults to
  false.

@@ -303,11 +293,20 @@ is saved to this path as a .keras file.
  Returns the scaled/transformed data frame. The default implementation uses no
  scaling or transformations.

- `postprocess(df)` [*function*]:
  Returns the rescaled/backtransformed data frame. The combination of preprocess()
  and postprocess() is expected to be idempotent. The default implementation uses
  no scaling or transformations.
- `postprocess(df)` [*function*]: Returns the rescaled/backtransformed data frame.
  The combination of preprocess() and postprocess() is expected to be idempotent.
  The default implementation uses no scaling or transformations.

- `assign_clusters(df)` [*function*]: Must return a vector of length
  `nrow(predictions)` that contains cluster labels as 0/1. According to these
  labels, two separate models will be used for inference and training. Cluster
  assignments can e.g. be done for the reactive and non-reactive parts of the
  field.

- `model_reactive_file_path` [*string*]: Path to the Keras model file with
  which the AI surrogate model for the reactive cluster is initialized. If
  omitted, the models for both clusters will be initialized from
  `model_file_path`.

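To illustrate the R-side hooks above, here is a hedged sketch of a `preprocess()`/`postprocess()` pair whose composition returns the original data, together with an `assign_clusters()` implementation that labels rows by a simple threshold. The column name `Tracer` and the cutoff are assumptions made for the example, not part of POET's API:

```r
# Hypothetical scaling pair: log-transform on the way in, inverse on the way
# out, so that postprocess(preprocess(df)) reproduces df.
preprocess <- function(df) {
  log10(df + 1e-30)
}
postprocess <- function(df) {
  10^df - 1e-30
}

# Hypothetical cluster assignment: one 0/1 label per row of the scaled
# predictors, where 1 marks the reactive cluster and 0 the non-reactive one.
assign_clusters <- function(df) {
  as.integer(df[["Tracer"]] > 0.01)
}
```

Because the labels now come from this R function, no separate clustering switch has to be set in the input script.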
```sh
cd <installation_dir>/bin

@@ -70,130 +70,6 @@ int Python_Keras_load_model(std::string model, std::string model_reactive, bool
  return py_model_loaded;
}


/**
 * @brief Calculates the Euclidean distance between two points in n-dimensional space
 * @param a Point a
 * @param b Point b
 * @return The distance
 */
double distance(const std::vector<double>& a, const std::vector<double>& b) {
  double sum = 0.0;
  for (size_t i = 0; i < a.size(); ++i) {
    sum += (a[i] - b[i]) * (a[i] - b[i]);
  }
  return sqrt(sum);
}

/**
 * @brief Assigns all elements of a 2D-Matrix to the nearest cluster center point
 * @param field 2D-Matrix with the content of a Field object
 * @param clusters The vector of clusters represented by their center points
 * @return A vector that contains the assigned cluster for each of the rows in field
 */
std::vector<int> assign_clusters(const std::vector<vector<double>>& field, const std::vector<vector<double>>& clusters) {
  // Initiate a vector that holds the cluster labels of each row
  std::vector<int> labels(field[0].size());

  for (size_t row = 0; row < labels.size(); row++) {
    // Get the coordinates of the current row
    std::vector<double> row_data(field.size());
    for (size_t column = 0; column < row_data.size(); column++) {
      row_data[column] = field[column][row];
    }
    // Iterate over the clusters and check which cluster center is the closest
    double current_min_distance = numeric_limits<double>::max();
    int current_closest_cluster = 0;
    for (size_t cluster = 0; cluster < clusters.size(); cluster++) {
      double cluster_distance = distance(row_data, clusters[cluster]);
      if (cluster_distance < current_min_distance) {
        current_min_distance = cluster_distance;
        current_closest_cluster = cluster;
      }
    }
    labels[row] = current_closest_cluster;
  }
  return labels;
}

/**
 * @brief Calculates new center points for each given cluster by averaging the coordinates
 *        of all points that are assigned to it
 * @param field 2D-Matrix with the content of a Field object
 * @param labels The vector that contains the assigned cluster for each of the rows in field
 * @param k The number of clusters
 * @return The new cluster center points
 */
std::vector<vector<double>> calculate_new_clusters(const std::vector<std::vector<double>>& field,
                                                   const vector<int>& labels, int k) {
  size_t columns = field.size();
  size_t rows = field[0].size();
  std::vector<std::vector<double>> clusters(k, std::vector<double>(columns, 0.0));
  vector<int> count(k, 0);

  // Sum the coordinates of all points that are assigned to each cluster
  for (size_t row = 0; row < rows; row++) {
    int assigned_cluster = labels[row];
    for (size_t column = 0; column < columns; column++) {
      clusters[assigned_cluster][column] += field[column][row];
    }
    count[assigned_cluster]++;
  }

  // Take the average of the summed coordinates
  for (size_t cluster = 0; cluster < k; cluster++) {
    if (count[cluster] == 0) continue;
    for (size_t column = 0; column < columns; column++) {
      clusters[cluster][column] /= count[cluster];
    }
  }
  return clusters;
}

/**
 * @brief Performs KMeans clustering for the elements of a 2D-Matrix
 * @param field 2D-Matrix with the content of a Field object
 * @param k The number of different clusters
 * @param iterations The number of cluster update steps
 * @return A vector that contains the assigned cluster for each of the rows in field
 */
std::vector<int> K_Means(std::vector<std::vector<double>>& field, int k, int iterations) {
  // Initialize cluster centers by selecting random points from the field
  srand(time(0));
  std::vector<vector<double>> clusters;
  for (size_t i = 0; i < k; ++i) {
    std::vector<double> cluster_center(field.size());
    int row = rand() % field[0].size();  // pick a random row (field is stored column-major)
    for (size_t column = 0; column < cluster_center.size(); column++) {
      cluster_center[column] = field[column][row];
    }
    clusters.push_back(cluster_center);
  }

  std::vector<int> labels;

  for (size_t iter = 0; iter < iterations; ++iter) {
    // Get the nearest cluster for each row
    labels = assign_clusters(field, clusters);
    // Update each cluster center as the average location of each point assigned to it
    std::vector<vector<double>> new_clusters = calculate_new_clusters(field, labels, k);
    clusters = new_clusters;
  }


  // Always define the reactive cluster as cluster 1
  // Interpret the reactive cluster as the one at the origin of the field
  // TODO: Is that always correct?
  int reactive_cluster = labels[0];
  if (reactive_cluster == 0) {
    for (size_t i = 0; i < labels.size(); i++) {
      labels[i] = 1 - labels[i];
    }
  }
  return labels;
}


/**
 * @brief Converts the std::vector 2D matrix representation of a POET Field object to a numpy array
 *        for use in the Python AI surrogate functions
@@ -622,7 +498,7 @@ void parallel_training(EigenModel* Eigen_model, EigenModel* Eigen_model_reactive
  // If clustering is used, check the current cluster
  int n_cluster_reactive = 0;
  int train_cluster = -1; // Default value for non-clustered training (all data is used)
  if (params.use_k_means_clustering) {
  if (params.use_clustering) {
    for (size_t i = 0; i < buffer_size; i++) {
      n_cluster_reactive += training_data_buffer->cluster_labels[i];
    }
@@ -643,7 +519,7 @@ void parallel_training(EigenModel* Eigen_model, EigenModel* Eigen_model_reactive
        buffer_row);
  }
  // Remove from cluster label buffer
  if (params.use_k_means_clustering) {
  if (params.use_clustering) {
    training_data_buffer->cluster_labels.erase(
        training_data_buffer->cluster_labels.begin() + buffer_row);
  }

@@ -61,8 +61,6 @@ void Python_finalize(std::mutex* Eigen_model_mutex, std::mutex* training_data_bu
int Python_Keras_load_model(std::string model, std::string model_reactive,
                            bool use_clustering);

std::vector<int> K_Means(std::vector<std::vector<double>>& field, int k, int maxIterations = 100);

std::vector<double> Python_Keras_predict(std::vector<std::vector<double>>& x, int batch_size,
                                         std::vector<int>& cluster_labels);

@@ -97,7 +95,6 @@ std::vector<double> Eigen_predict(const EigenModel& model, std::vector<std::vect
inline void Python_Keras_setup(std::string, std::string){}
inline void Python_finalize(std::mutex*, std::mutex*, std::condition_variable*, bool*, bool*){}
inline void Python_Keras_load_model(std::string, std::string, bool){}
inline std::vector<int> K_Means(std::vector<std::vector<double>>&, int, int) {return {};}
inline std::vector<double> Python_Keras_predict(std::vector<std::vector<double>>&, int,
                                                std::vector<int>&){return {};}
inline void training_data_buffer_append(std::vector<std::vector<double>>&,

src/poet.cpp (20 changes)
@@ -308,7 +308,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, const RuntimeParameters &params,

  // Initiate two models from one file
  Python_Keras_load_model(R["model_file_path"], R["model_reactive_file_path"],
                          params.use_k_means_clustering);
                          params.use_clustering);
  if (!params.disable_training) {
    MSG("AI: Initialize training thread");
    Python_Keras_training_thread(&Eigen_model, &Eigen_model_reactive,
@@ -335,7 +335,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, const RuntimeParameters &params,
  }
  // Set initial model weights
  update_weights(&Eigen_model, cpp_weights);
  if (params.use_k_means_clustering) {
  if (params.use_clustering) {
    // Initialize Eigen model for reactive part of the field
    cpp_weights = Python_Keras_get_weights("model_reactive");
    num_layers = cpp_weights.size() / 2;
@@ -391,16 +391,16 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, const RuntimeParameters &params,
  std::vector<std::vector<double>> predictors_scaled = R["predictors_scaled"];

  // Get K-Means cluster assignments based on the preprocessed data
  if (params.use_k_means_clustering) {
    cluster_labels = K_Means(predictors_scaled, 2, 300);
    R["cluster_labels"] = cluster_labels;
  if (params.use_clustering) {
    R.parseEval("cluster_labels <- assign_clusters(predictors_scaled)");
    cluster_labels = Rcpp::as<std::vector<int>>(R["cluster_labels"]);
  }

  MSG("AI: Predict");
  if (params.use_Keras_predictions) { // Predict with Keras default function
    R["TMP"] = Python_Keras_predict(predictors_scaled, params.batch_size, cluster_labels);
  } else { // Predict with custom Eigen function
    if (params.use_k_means_clustering) {
    if (params.use_clustering) {
      R["TMP"] = Eigen_predict_clustered(Eigen_model, Eigen_model_reactive,
                                         predictors_scaled, params.batch_size,
                                         &Eigen_model_mutex, cluster_labels);
@@ -475,7 +475,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, const RuntimeParameters &params,
  // count buffer size according to the cluster assignments
  int n_cluster_reactive = 0;
  size_t buffer_size = training_data_buffer.x[0].size();
  if (params.use_k_means_clustering) {
  if (params.use_clustering) {
    cluster_labels_append(training_data_buffer.cluster_labels, cluster_labels,
                          R["validity_vector"]);
    for (size_t i = 0; i < buffer_size; i++) {
@@ -714,9 +714,9 @@ int main(int argc, char *argv[]) {
    run_params.save_model_path = Rcpp::as<std::string>(R["save_model_path"]);
    MSG("AI: Model will be saved as \"" + run_params.save_model_path + "\"");
  }
  if (Rcpp::as<bool>(R.parseEval("exists(\"use_k_means_clustering\")"))) {
    run_params.use_k_means_clustering = R["use_k_means_clustering"];
    MSG("K-Means clustering will be used for the AI surrogate")
  if (Rcpp::as<bool>(R.parseEval("exists(\"assign_clusters\")"))) {
    run_params.use_clustering = true;
    MSG("Clustering will be used for the AI surrogate")
  }
  if (Rcpp::as<bool>(R.parseEval("exists(\"train_only_invalid\")"))) {
    run_params.train_only_invalid = R["train_only_invalid"];

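As this hunk shows, clustering is now enabled simply by defining `assign_clusters` in the R input script; the old `use_k_means_clustering` variable is no longer read. A hedged sketch of the relevant input-script lines (the file names and the labelling rule are placeholders, not values from the repository):

```r
# Hypothetical excerpt from an R input script.
model_file_path          <- "surrogate.keras"           # placeholder path
model_reactive_file_path <- "surrogate_reactive.keras"  # optional; falls back to model_file_path

# Defining this function is what switches run_params.use_clustering on.
assign_clusters <- function(df) {
  as.integer(df[[1]] > 0)  # placeholder 0/1 rule over the scaled predictors
}
```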
@@ -72,7 +72,7 @@ struct RuntimeParameters {
  /* AI surrogate configuration */
  bool use_ai_surrogate = false; // Can be set with command line flag ---ai-surrogate
  bool disable_training = false; // Can be set in the R input script
  bool use_k_means_clustering = false; // Can be set in the R input script
  bool use_clustering = false; // Can be set in the R input script
  bool use_Keras_predictions = false; // Can be set in the R input script
  bool train_only_invalid = false; // Can be set in the R input script
  int batch_size = 2560; // default value determined in tests on the UP Turing cluster
