feat: cluster labels from R function

straile 2024-11-02 17:37:15 +01:00
parent 81723f81f8
commit 1c4b949ce9
5 changed files with 27 additions and 155 deletions

View File

@@ -253,7 +253,7 @@ The following variables and functions must be declared:
 - `model_file_path` [*string*]: Path to the Keras model file with which
   the AI surrogate model is initialized.
-- `validate_predictions(predictors, prediction)` [*function*]: Returns a
+- `validate_predictions(predictors, prediction)` [*function*]: Must return a
   boolean vector of length `nrow(predictions)`. The output of this function
   defines which predictions are considered valid and which are rejected.
   The predictors and predictions are passed in their original (not
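For illustration, a minimal sketch of such a check as it might appear in the R input script. The all-nonnegative criterion and the assumption that every prediction column is numeric are placeholders, not part of the documented interface:

```r
# Hypothetical validity check: reject any prediction row containing a negative
# value. Returns one logical value per row of `prediction`, as required above.
validate_predictions <- function(predictors, prediction) {
  apply(prediction, 1, function(row) all(row >= 0))
}
```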
@@ -279,16 +279,6 @@ of data from the front of the buffer. Defaults to the size of the Field.
   should be used instead of the custom C++ implementation (Keras might be faster
   for larger models, especially on GPU). Defaults to false.
-- `use_k_means_clustering` [*bool*]: Decides if the K-Means clustering function
-  will be used to separate the field in a reactive and a non-reactive cluster.
-  Training and inference will be done with separate models for each cluster.
-  Defaults to false.
-- `model_reactive_file_path` [*string*]: Path to the Keras model file with
-  which the AI surrogate model for the reactive cluster is initialized. If
-  ommitted, the models for both clusters will be initialized from
-  `model_file_path`
 - `disable_training` [*bool*]: Deactivates the training functions. Defaults to
   false.
@@ -303,11 +293,20 @@ is saved to this path as a .keras file.
   Returns the scaled/transformed data frame. The default implementation uses no
   scaling or transformations.
-- `postprocess(df)` [*function*]:
-  Returns the rescaled/backtransformed data frame. The combination of preprocess()
-  and postprocess() is expected to be idempotent. The default implementation uses
-  no scaling or transformations.
+- `postprocess(df)` [*function*]: Returns the rescaled/backtransformed data frame.
+  The combination of preprocess() and postprocess() is expected to be idempotent.
+  The default implementation uses no scaling or transformations.
+- `assign_clusters(df)` [*function*]: Must return a vector of length
+  `nrow(predictions)` that contains cluster labels as 0/1. According to these
+  labels, two separate models will be used for inference and training. Cluster
+  assignments can, for example, be made for the reactive and non-reactive parts
+  of the field.
+- `model_reactive_file_path` [*string*]: Path to the Keras model file with
+  which the AI surrogate model for the reactive cluster is initialized. If
+  omitted, the models for both clusters will be initialized from
+  `model_file_path`.
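As a concrete, purely illustrative sketch of these user-defined R functions (the median split and the log10 scaling are assumptions, not part of the POET interface):

```r
# Placeholder cluster assignment: label rows whose first predictor column
# exceeds its median as reactive (1) and everything else as non-reactive (0).
# The real criterion is application-specific.
assign_clusters <- function(df) {
  as.integer(df[[1]] > median(df[[1]]))
}

# Matching scaling pair: postprocess() must undo preprocess() so that their
# combination returns the original data (log10 assumes strictly positive values).
preprocess <- function(df) {
  log10(df)
}
postprocess <- function(df) {
  10^df
}
```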
 ```sh
 cd <installation_dir>/bin

View File

@@ -70,130 +70,6 @@ int Python_Keras_load_model(std::string model, std::string model_reactive, bool
   return py_model_loaded;
 }
-
-/**
- * @brief Calculates the euclidian distance between two points in n dimensional space
- * @param a Point a
- * @param b Point b
- * @return The distance
- */
-double distance(const std::vector<double>& a, const std::vector<double>& b) {
-  double sum = 0.0;
-  for (size_t i = 0; i < a.size(); ++i) {
-    sum += (a[i] - b[i]) * (a[i] - b[i]);
-  }
-  return sqrt(sum);
-}
-
-/**
- * @brief Assigns all elements of a 2D-Matrix to the nearest cluster center point
- * @param field 2D-Matrix with the content of a Field object
- * @param clusters The vector of clusters represented by their center points
- * @return A vector that contains the assigned cluster for each of the rows in field
- */
-std::vector<int> assign_clusters(const std::vector<vector<double>>& field, const std::vector<vector<double>>& clusters) {
-  // Initiate a vector that holds the cluster labels of each row
-  std::vector<int> labels(field[0].size());
-  for (size_t row = 0; row < labels.size(); row++) {
-    // Get the coordinates of the current row
-    std::vector<double> row_data(field.size());
-    for (size_t column = 0; column < row_data.size(); column++) {
-      row_data[column] = field[column][row];
-    }
-    // Iterate over the clusters and check which cluster center is the closest
-    double current_min_distance = numeric_limits<double>::max();
-    int current_closest_cluster;
-    for (size_t cluster = 0; cluster < clusters.size(); cluster++) {
-      double cluster_distance = distance(row_data, clusters[cluster]);
-      if (cluster_distance < current_min_distance) {
-        current_min_distance = cluster_distance;
-        current_closest_cluster = cluster;
-      }
-    }
-    labels[row] = current_closest_cluster;
-  }
-  return labels;
-}
-
-/**
- * @brief Calculates new center points for each given cluster by averaging the coordinates
- *        of all points that are assigen to it
- * @param field 2D-Matrix with the content of a Field object
- * @param labels The vector that contains the assigned cluster for each of the rows in field
- * @param k The number of clusters
- * @return The new cluster center points
- */
-std::vector<vector<double>> calculate_new_clusters(const std::vector<std::vector<double>>& field,
-                                                   const vector<int>& labels, int k) {
-  size_t columns = field.size();
-  size_t rows = field[0].size();
-  std::vector<std::vector<double>> clusters(k, std::vector<double>(columns, 0.0));
-  vector<int> count(k, 0);
-  // Sum the coordinates of all points that are assigned to each cluster
-  for (size_t row = 0; row < rows; row++) {
-    int assigned_cluster = labels[row];
-    for (size_t column = 0; column < columns; column++) {
-      clusters[assigned_cluster][column] += field[column][row];
-    }
-    count[assigned_cluster]++;
-  }
-  // Take the average of the summed coordinates
-  for (size_t cluster = 0; cluster < k; cluster++) {
-    if (count[cluster] == 0) continue;
-    for (size_t column = 0; column < columns; column++) {
-      clusters[cluster][column] /= count[cluster];
-    }
-  }
-  return clusters;
-}
-
-/**
- * @brief Performs KMeans clustering for the elements of a 2D-Matrix
- * @param field 2D-Matrix with the content of a Field object
- * @param k The number of different clusters
- * @param iterations The number of cluster update steps
- * @return A vector that contains the assigned cluster for each of the rows in field
- */
-std::vector<int> K_Means(std::vector<std::vector<double>>& field, int k, int iterations) {
-  // Initialize cluster centers by selecting random points from the field
-  srand(time(0));
-  std::vector<vector<double>> clusters;
-  for (size_t i = 0; i < k; ++i) {
-    std::vector<double> cluster_center(field.size());
-    int row = rand() % field.size();
-    for (size_t column = 0; column < cluster_center.size(); column++) {
-      cluster_center[column] = field[column][row];
-    }
-    clusters.push_back(cluster_center);
-  }
-  std::vector<int> labels;
-  for (size_t iter = 0; iter < iterations; ++iter) {
-    // Get the nearest cluster for each row
-    labels = assign_clusters(field, clusters);
-    // Update each cluster center as the average location of each point assigned to it
-    std::vector<vector<double>> new_clusters = calculate_new_clusters(field, labels, k);
-    clusters = new_clusters;
-  }
-  // Always define the reactive cluster as cluster 1
-  // Interprete the reactive cluster as the one on the origin of the field
-  // TODO: Is that always correct?
-  int reactive_cluster = labels[0];
-  if (reactive_cluster == 0) {
-    for (size_t i; i < labels.size(); i++) {
-      labels[i] = 1 - labels[i];
-    }
-  }
-  return labels;
-}
 /**
  * @brief Converts the std::vector 2D matrix representation of a POET Field object to a numpy array
  *        for use in the Python AI surrogate functions
@@ -622,7 +498,7 @@ void parallel_training(EigenModel* Eigen_model, EigenModel* Eigen_model_reactive
   // If clustering is used, check the current cluster
   int n_cluster_reactive = 0;
   int train_cluster = -1; // Default value for non clustered training (all data is used)
-  if (params.use_k_means_clustering) {
+  if (params.use_clustering) {
     for (size_t i = 0; i < buffer_size; i++) {
       n_cluster_reactive += training_data_buffer->cluster_labels[i];
     }
@@ -643,7 +519,7 @@ void parallel_training(EigenModel* Eigen_model, EigenModel* Eigen_model_reactive
                            buffer_row);
   }
   // Remove from cluster label buffer
-  if (params.use_k_means_clustering) {
+  if (params.use_clustering) {
     training_data_buffer->cluster_labels.erase(
         training_data_buffer->cluster_labels.begin() + buffer_row);
   }

View File

@@ -61,8 +61,6 @@ void Python_finalize(std::mutex* Eigen_model_mutex, std::mutex* training_data_bu
 int Python_Keras_load_model(std::string model, std::string model_reactive,
                             bool use_clustering);
-std::vector<int> K_Means(std::vector<std::vector<double>>& field, int k, int maxIterations = 100);
 std::vector<double> Python_Keras_predict(std::vector<std::vector<double>>& x, int batch_size,
                                          std::vector<int>& cluster_labels);
@@ -97,7 +95,6 @@ std::vector<double> Eigen_predict(const EigenModel& model, std::vector<std::vect
 inline void Python_Keras_setup(std::string, std::string){}
 inline void Python_finalize(std::mutex*, std::mutex*, std::condition_variable*, bool*, bool*){}
 inline void Python_Keras_load_model(std::string, std::string, bool){}
-inline std::vector<int> K_Means(std::vector<std::vector<double>>&, int, int) {return {};}
 inline std::vector<double> Python_Keras_predict(std::vector<std::vector<double>>&, int,
                                                 std::vector<int>&){return {};}
 inline void training_data_buffer_append(std::vector<std::vector<double>>&,

View File

@@ -308,7 +308,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, const RuntimeParameters &params,
   // Initiate two models from one file
   Python_Keras_load_model(R["model_file_path"], R["model_reactive_file_path"],
-                          params.use_k_means_clustering);
+                          params.use_clustering);
   if (!params.disable_training) {
     MSG("AI: Initialize training thread");
     Python_Keras_training_thread(&Eigen_model, &Eigen_model_reactive,
@ -335,7 +335,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, const RuntimeParameters &params,
} }
// Set initial model weights // Set initial model weights
update_weights(&Eigen_model, cpp_weights); update_weights(&Eigen_model, cpp_weights);
if (params.use_k_means_clustering) { if (params.use_clustering) {
// Initialize Eigen model for reactive part of the field // Initialize Eigen model for reactive part of the field
cpp_weights = Python_Keras_get_weights("model_reactive"); cpp_weights = Python_Keras_get_weights("model_reactive");
num_layers = cpp_weights.size() / 2; num_layers = cpp_weights.size() / 2;
@@ -391,16 +391,16 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, const RuntimeParameters &params,
   std::vector<std::vector<double>> predictors_scaled = R["predictors_scaled"];
   // Get K-Means cluster assignments based on the preprocessed data
-  if (params.use_k_means_clustering) {
-    cluster_labels = K_Means(predictors_scaled, 2, 300);
-    R["cluster_labels"] = cluster_labels;
+  if (params.use_clustering) {
+    R.parseEval("cluster_labels <- assign_clusters(predictors_scaled)");
+    cluster_labels = Rcpp::as<std::vector<int>>(R["cluster_labels"]);
   }
   MSG("AI: Predict");
   if (params.use_Keras_predictions) { // Predict with Keras default function
     R["TMP"] = Python_Keras_predict(predictors_scaled, params.batch_size, cluster_labels);
   } else { // Predict with custom Eigen function
-    if (params.use_k_means_clustering) {
+    if (params.use_clustering) {
       R["TMP"] = Eigen_predict_clustered(Eigen_model, Eigen_model_reactive,
                                          predictors_scaled, params.batch_size,
                                          &Eigen_model_mutex, cluster_labels);
@@ -475,7 +475,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, const RuntimeParameters &params,
   // count buffer size according to the cluster assignments
   int n_cluster_reactive = 0;
   size_t buffer_size = training_data_buffer.x[0].size();
-  if (params.use_k_means_clustering) {
+  if (params.use_clustering) {
     cluster_labels_append(training_data_buffer.cluster_labels, cluster_labels,
                           R["validity_vector"]);
     for (size_t i = 0; i < buffer_size; i++) {
@@ -714,9 +714,9 @@ int main(int argc, char *argv[]) {
     run_params.save_model_path = Rcpp::as<std::string>(R["save_model_path"]);
     MSG("AI: Model will be saved as \"" + run_params.save_model_path + "\"");
   }
-  if (Rcpp::as<bool>(R.parseEval("exists(\"use_k_means_clustering\")"))) {
-    run_params.use_k_means_clustering = R["use_k_means_clustering"];
-    MSG("K-Means clustering will be used for the AI surrogate")
+  if (Rcpp::as<bool>(R.parseEval("exists(\"assign_clusters\")"))) {
+    run_params.use_clustering = true;
+    MSG("Clustering will be used for the AI surrogate")
   }
   if (Rcpp::as<bool>(R.parseEval("exists(\"train_only_invalid\")"))) {
     run_params.train_only_invalid = R["train_only_invalid"];

View File

@@ -72,7 +72,7 @@ struct RuntimeParameters {
   /* AI surrogate configuration */
   bool use_ai_surrogate = false; // Can be set with command line flag --ai-surrogate
   bool disable_training = false; // Can be set in the R input script
-  bool use_k_means_clustering = false; // Can be set in the R input script
+  bool use_clustering = false; // Can be set in the R input script
   bool use_Keras_predictions = false; // Can be set in the R input script
   bool train_only_invalid = false; // Can be set in the R input script
   int batch_size = 2560; // default value determined in tests on the UP Turing cluster
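Because the master loop now only checks `exists("assign_clusters")` (see the last main-loop hunk above), clustering is enabled simply by defining that function in the R input script. A hypothetical fragment, with placeholder file names and a placeholder labelling rule:

```r
model_file_path          <- "model.keras"            # placeholder path
model_reactive_file_path <- "model_reactive.keras"   # placeholder; optional, falls back to model_file_path

# The presence of this function is what sets use_clustering to true.
assign_clusters <- function(df) {
  as.integer(df[[1]] > median(df[[1]]))   # placeholder 0/1 labels, 1 = reactive cluster
}
```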