From a3a643577ad95c023ed478efef02df1755935dc6 Mon Sep 17 00:00:00 2001 From: hans Date: Thu, 30 May 2024 13:20:50 +0200 Subject: [PATCH] docs: updated docs for v03 --- README.md | 19 + bench/barite/README.org | 9 + docs/0230720_Scheme_POET_en.svg | 602 +++++++++++++++++++++++++++++ docs/20230720_Scheme_POET_en.svg | 637 ------------------------------- src/Init/ChemistryInit.cpp | 19 +- src/poet.cpp | 11 +- 6 files changed, 655 insertions(+), 642 deletions(-) create mode 100644 docs/0230720_Scheme_POET_en.svg delete mode 100644 docs/20230720_Scheme_POET_en.svg diff --git a/README.md b/README.md index e1e9f6b03..1c062244b 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,7 @@ The following parameters can be set: |-----------------------------|--------------|--------------------------------------------------------------------------------------------------------------------------| | **--work-package-size=** | _1..n_ | size of work packages (defaults to _5_) | | **-P, --progress** | | show progress bar | +| **--ai-surrogate** | | activates the AI surrogate chemistry model (defaults to _OFF_) | | **--dht** | | enabling DHT usage (defaults to _OFF_) | | **--dht-strategy=** | _0-1_ | change DHT strategy. **NOT IMPLEMENTED YET** (Defaults to _0_) | | **--dht-size=** | _1-n_ | size of DHT per process involved in megabyte (defaults to _1000 MByte_) | @@ -231,3 +232,21 @@ important information from the OpenMPI Man Page: For example, on platforms that support it, the clock_gettime() function will be used to obtain a monotonic clock value with whatever precision is supported on that platform (e.g., nanoseconds). + +## Additional functions for the AI surrogate + +The AI surrogate can be activated for any benchmark and is by default initiated as a sequential keras model with three hidden layers of depth 48, 96, 24 with ReLU activation and the Adam optimizer. All functions in `ai_surrogate_model.R` can be overridden by adding custom definitions via an R file in the input script. 
+This is done by adding the path to this file in the input script. Simply add the path as an element called `ai_surrogate_input_script` to the `chemistry_setup` list. 
+Please use the global variable `ai_surrogate_base_path` as a base path when relative filepaths are used in custom functions. + +**There is currently no default implementation to determine the validity of predicted values.** This means that every input script must include an R source file with a custom function `validate_predictions(predictors, prediction)`. Examples for custom functions can be found for the barite_200 benchmark. + +The functions can be defined as follows: + +`validate_predictions(predictors, prediction)`: Returns a boolean index vector that signals for each row in the predictions if the values are considered valid. Can e.g. be implemented as a mass balance threshold between the predictors and the prediction. + +`initiate_model()`: Returns a keras model. Can be used to load pretrained models. + +`preprocess(df, backtransform = FALSE, outputs = FALSE)`: Returns the scaled/transformed/backtransformed dataframe. The `backtransform` flag signals if the current processing step is applied to data that's assumed to be scaled and expects backtransformed values. The `outputs` flag signals if the current processing step is applied to the output or target of the model. This can be used to e.g. skip these processing steps and only scale the model input. + +`training_step(model, predictor, target, validity)`: Trains the model after each iteration. `validity` is the boolean index vector given by `validate_predictions` and can e.g. be used to train only on values whose predictions were not valid. 
\ No newline at end of file diff --git a/bench/barite/README.org b/bench/barite/README.org index 86239c5de..508dbfc31 100644 --- a/bench/barite/README.org +++ b/bench/barite/README.org @@ -22,6 +22,15 @@ mpirun -np 4 ./poet --interp barite_interp_eval.R barite_results grid - =barite_200.R=: POET input script for a 200x200 simulation grid +- =barite_200ai_surrogate_input_script.R=: Defines the ai surrogate functions + to load a pretrained model and apply min-max-feature scaling on the model inputs + and target. Prediction validity is assessed with a threshold of 3e-5 on the mass + balance of Ba and Sr. +- =barite_200min_max_bounds=: Minimum and maximum values from 50 iterations of the + barite_200 benchmark. Used for feature scaling in the ai surrogate. +- =barite_200model_min_max.keras=: A sequential keras model that has been trained + on 50 iterations of the barite_200 benchmark with min-max-scaled inputs + and targets/outputs. - =db_barite.dat=: PHREEQC database containing the kinetic expressions for barite and celestite, stripped down from =phreeqc.dat= - =barite.pqi=: PHREEQC input script defining the chemical system diff --git a/docs/0230720_Scheme_POET_en.svg b/docs/0230720_Scheme_POET_en.svg new file mode 100644 index 000000000..4893de98c --- /dev/null +++ b/docs/0230720_Scheme_POET_en.svg @@ -0,0 +1,602 @@ + + + +tugFlowTransportPOETChemistryUpdateMasterIn DHT?RetrieveResultPHREEQCStore in +DHTYesWorkerAI Surrogate invalid?AI SurrogateDistributeWorkGather ResultsNext Iteration diff --git a/docs/20230720_Scheme_POET_en.svg b/docs/20230720_Scheme_POET_en.svg deleted file mode 100644 index dd29d9b9e..000000000 --- a/docs/20230720_Scheme_POET_en.svg +++ /dev/null @@ -1,637 +0,0 @@ - - - -tugFlowTransportPOETChemistryIn DHT?RetrieveResultPHREEQCStore in DHTAI SurrogateAccurate?YesNoUpdateNoYesWorkerDistribute WorkGather ResultsMasterNext Iteration diff --git a/src/Init/ChemistryInit.cpp b/src/Init/ChemistryInit.cpp index 0e512023e..90631fd01 100644 --- 
a/src/Init/ChemistryInit.cpp +++ b/src/Init/ChemistryInit.cpp @@ -34,7 +34,24 @@ void InitialList::initChemistry(const Rcpp::List &chem) { if (chem.containsElementNamed("ai_surrogate_input_script")) { std::string ai_surrogate_input_script_path = chem["ai_surrogate_input_script"]; - this->ai_surrogate_input_script = Rcpp::as(Rcpp::Function("normalizePath")(Rcpp::wrap(ai_surrogate_input_script_path))); + ai_surrogate_input_script_path = Rcpp::as(Rcpp::Function("normalizePath")(Rcpp::wrap(ai_surrogate_input_script_path))); + + // Copying the entire script for the init file + std::ifstream file(ai_surrogate_input_script_path); + if (!file.is_open()) { + // print error message and return + Rcpp::Rcerr << "AI surroghate input script was not found at: " << ai_surrogate_input_script_path << std::endl; + } + + std::stringstream buffer; + buffer << file.rdbuf(); + std::string fileContent = buffer.str(); + file.close(); + + // Add the filepath as a global variable in R to enable relative filepaths in the R script + fileContent += "\nai_surrogate_base_path <- \"" + ai_surrogate_input_script_path + "\""; + + this->ai_surrogate_input_script = fileContent; } this->field_header = diff --git a/src/poet.cpp b/src/poet.cpp index 5ee6260fc..5322f3b17 100644 --- a/src/poet.cpp +++ b/src/poet.cpp @@ -462,11 +462,14 @@ int main(int argc, char *argv[]) { R["ai_surrogate_species"] = init_list.getChemistryInit().dht_species.getNames(); R["out_dir"] = run_params.out_dir; - const std::string ai_surrogate_input_script_path = init_list.getChemistryInit().ai_surrogate_input_script; + const std::string ai_surrogate_input_script = init_list.getChemistryInit().ai_surrogate_input_script; - if (!ai_surrogate_input_script_path.empty()) { - R["ai_surrogate_base_path"] = ai_surrogate_input_script_path.substr(0, ai_surrogate_input_script_path.find_last_of('/') + 1); - R.parseEvalQ("source('" + ai_surrogate_input_script_path + "')"); + if (!ai_surrogate_input_script.empty()) { + /* Incorporate user 
defined ai surrogate input script */ + R.parseEvalQ(ai_surrogate_input_script); + + std::string ai_surrogate_base_path = R["ai_surrogate_base_path"]; + R["ai_surrogate_base_path"] = ai_surrogate_base_path.substr(0, ai_surrogate_base_path.find_last_of('/') + 1); } R.parseEval("model <- initiate_model()"); R.parseEval("gpu_info()");