feat: fast serialization/storage using qs package via --qs flag

2025-12-15 20:38:23 +01:00 · 2024-06-11 16:50:02 +02:00 · 2024-06-11 16:50:02 +02:00 · 8d0be5ae0d
commit 8d0be5ae0d
parent edf936f3d0
4 changed files with 160 additions and 97 deletions
--- a/README.md
+++ b/README.md
@ -1,5 +1,5 @@
 <!--
-    Time-stamp: "Last modified 2023-08-02 13:55:11 mluebke"
+    Time-stamp: "Last modified 2024-09-12 11:39:28 delucia"
 -->

 # POET
@ -87,7 +87,7 @@ follows:
 $ R

 # install R dependencies
-> install.packages(c("Rcpp", "RInside"))
+> install.packages(c("Rcpp", "RInside","qs"))
 > q(save="no")

 # cd into POET project root
@ -133,13 +133,14 @@ With the installation of POET, two executables are provided:
  - `poet` - the main executable to run simulations
  - `poet_init` - a preprocessor to generate input files for POET from R scripts

-Preprocessed benchmarks can be found in the `share/poet` directory with an
-according *runtime* setup. More on those files and how to create them later. 
+Preprocessed benchmarks can be found in the `share/poet` directory
+with a corresponding *runtime* setup. More on those files and how to
+create them later.

 ## Running

-Run POET by `mpirun ./poet [OPTIONS] <RUNFILE> <SIMFILE> <OUTPUT_DIRECTORY>`
-where:
+Run POET by `mpirun ./poet [OPTIONS] <RUNFILE> <SIMFILE>
+<OUTPUT_DIRECTORY>` where:

 - **OPTIONS** - POET options (explained below)
 - **RUNFILE** - Runtime parameters described as R script 
@ -154,8 +155,9 @@ The following parameters can be set:
 |-----------------------------|--------------|--------------------------------------------------------------------------------------------------------------------------|
 | **--work-package-size=**    | _1..n_       | size of work packages (defaults to _5_)                                                                                  |
 | **-P, --progress**          |              | show progress bar                                                                                                        |
-| **--ai-surrogate**          |              | activates the AI surrogate chemistry model (defaults to _OFF_) |
+| **--ai-surrogate**          |              | activates the AI surrogate chemistry model (defaults to _OFF_)                                                           |
 | **--dht**                   |              | enabling DHT usage (defaults to _OFF_)                                                                                   |
+| **--qs**                    |              | store results using qs::qsave() (.qs extension) instead of default RDS (.rds)                                            |
 | **--dht-strategy=**         | _0-1_        | change DHT strategy. **NOT IMPLEMENTED YET** (Defaults to _0_)                                                           |
 | **--dht-size=**             | _1-n_        | size of DHT per process involved in megabyte (defaults to _1000 MByte_)                                                  |
 | **--dht-snaps=**            | _0-2_        | disable or enable storage of DHT snapshots                                                                               |
@ -253,12 +255,13 @@ produce any valid predictions.

 ## Defining a model

-In order to provide a model to POET, you need to setup a R script which can then
-be used by `poet_init` to generate the simulation input. Which parameters are
-required can be found in the
-[Wiki](https://git.gfz-potsdam.de/naaice/poet/-/wikis/Initialization). We try to
-keep the document up-to-date. However, if you encounter missing information or
-need help, please get in touch with us via the issue tracker or E-Mail.
+In order to provide a model to POET, you need to set up an R script
+which can then be used by `poet_init` to generate the simulation
+input. Which parameters are required can be found in the
+[Wiki](https://git.gfz-potsdam.de/naaice/poet/-/wikis/Initialization).
+We try to keep the document up to date. However, if you encounter
+missing information or need help, please get in touch with us via the
+issue tracker or E-Mail.

 `poet_init` can be used as follows:

@ -268,46 +271,50 @@ need help, please get in touch with us via the issue tracker or E-Mail.

 where: 

- **output** - name of the output file (defaults to the input file name
-  with the extension `.rds`)
- **setwd** - set the working directory to the directory of the input file (e.g.
-  to allow relative paths in the input script). However, the output file
-  will be stored in the directory from which `poet_init` was called.
+- **output** - name of the output file (defaults to the input file
+  name with the extension `.rds`)
+- **setwd** - set the working directory to the directory of the input
+  file (e.g. to allow relative paths in the input script). However,
+  the output file will be stored in the directory from which
+  `poet_init` was called.

 ## Additional functions for the AI surrogate

-The AI surrogate can be activated for any benchmark and is by default initiated
-as a sequential keras model with three hidden layer of depth 48, 96, 24 with
-relu activation and adam optimizer. All functions in `ai_surrogate_model.R` can
-be overridden by adding custom definitions via an R file in the input script.
-This is done by adding the path to this file in the input script. Simply add the
-path as an element called `ai_surrogate_input_script` to the `chemistry_setup`
-list. Please use the global variable `ai_surrogate_base_path` as a base path
+The AI surrogate can be activated for any benchmark and is by default
+initiated as a sequential keras model with three hidden layers of depth
+48, 96, 24 with relu activation and adam optimizer. All functions in
+`ai_surrogate_model.R` can be overridden by adding custom definitions
+via an R file in the input script. This is done by adding the path to
+this file in the input script. Simply add the path as an element
+called `ai_surrogate_input_script` to the `chemistry_setup` list.
+Please use the global variable `ai_surrogate_base_path` as a base path
-when relative filepaths are used in custom funtions.
+when relative filepaths are used in custom functions.

-**There is currently no default implementation to determine the validity of
-predicted values.** This means, that every input script must include an R source
-file with a custom function `validate_predictions(predictors, prediction)`.
-Examples for custom functions can be found for the barite_200 benchmark
+**There is currently no default implementation to determine the
+validity of predicted values.** This means that every input script
+must include an R source file with a custom function
+`validate_predictions(predictors, prediction)`. Examples for custom
+functions can be found for the barite_200 benchmark.

 The functions can be defined as follows:

-`validate_predictions(predictors, prediction)`: Returns a boolean index vector
-that signals for each row in the predictions if the values are considered valid.
-Can eg. be implemented as a mass balance threshold between the predictors and
-the prediction.
+`validate_predictions(predictors, prediction)`: Returns a boolean
+index vector that signals for each row in the predictions if the
+values are considered valid. Can e.g. be implemented as a mass balance
+threshold between the predictors and the prediction.

-`initiate_model()`: Returns a keras model. Can be used to load pretrained
-models.
+`initiate_model()`: Returns a keras model. Can be used to load
+pretrained models.

 `preprocess(df, backtransform = FALSE, outputs = FALSE)`: Returns the
-scaled/transformed/backtransformed dataframe. The `backtransform` flag signals
-if the current processing step is applied to data that's assumed to be scaled
-and expects backtransformed values. The `outputs` flag signals if the current
-processing step is applied to the output or tatget of the model. This can be
-used to eg. skip these processing steps and only scale the model input.
+scaled/transformed/backtransformed dataframe. The `backtransform` flag
+signals if the current processing step is applied to data that's
+assumed to be scaled and expects backtransformed values. The `outputs`
+flag signals if the current processing step is applied to the output
+or target of the model. This can be used to e.g. skip these processing
+steps and only scale the model input.

-`training_step (model, predictor, target, validity)`: Trains the model after
-each iteration. `validity` is the bool index vector given by
-`validate_predictions` and can eg. be used to only train on values that have not
-been valid predictions.
+`training_step(model, predictor, target, validity)`: Trains the model
+after each iteration. `validity` is the bool index vector given by
+`validate_predictions` and can e.g. be used to only train on values
+that have not been valid predictions.
--- a/R_lib/kin_r_library.R
+++ b/R_lib/kin_r_library.R
@ -1,4 +1,4 @@
-## Time-stamp: "Last modified 2023-08-15 11:58:23 delucia"
+## Time-stamp: "Last modified 2024-06-11 14:26:33 delucia"

 ### Copyright (C) 2018-2023 Marco De Lucia, Max Luebke (GFZ Potsdam)
 ###
@ -35,14 +35,18 @@ master_init <- function(setup, out_dir, init_field) {
    setup$iterations <- setup$maxiter
    setup$simulation_time <- 0

+    dgts <- as.integer(ceiling(log10(setup$maxiter)))
+    ## string format to use in sprintf
+    fmt <- paste0("%0", dgts, "d")
+
    if (is.null(setup[["store_result"]])) {
        setup$store_result <- TRUE
    }

    if (setup$store_result) {
-        init_field_out <- paste0(out_dir, "/iter_0.rds")
+        init_field_out <- paste0(out_dir, "/iter_", sprintf(fmt = fmt, 0), ".", setup$out_ext)
        init_field <- data.frame(init_field, check.names = FALSE)
-        saveRDS(init_field, file = init_field_out)
+        SaveRObj(x = init_field, path = init_field_out)
        msgm("Stored initial field in ", init_field_out)
        if (is.null(setup[["out_save"]])) {
            setup$out_save <- seq(1, setup$iterations)
@ -69,7 +73,7 @@ master_iteration_end <- function(setup, state_T, state_C) {
    ## comprised in setup$out_save
    if (setup$store_result) {
        if (iter %in% setup$out_save) {
-            nameout <- paste0(setup$out_dir, "/iter_", sprintf(fmt = fmt, iter), ".rds")
+            nameout <- paste0(setup$out_dir, "/iter_", sprintf(fmt = fmt, iter), ".", setup$out_ext)
            state_T <- data.frame(state_T, check.names = FALSE)
            state_C <- data.frame(state_C, check.names = FALSE)
            
@ -77,13 +81,14 @@ master_iteration_end <- function(setup, state_T, state_C) {
                prediction_time = if(exists("ai_prediction_time")) as.integer(ai_prediction_time) else NULL,
                training_time = if(exists("ai_training_time")) as.integer(ai_training_time) else NULL,
                valid_predictions = if(exists("validity_vector")) validity_vector else NULL)
-            saveRDS(list(
-                T = state_T,
-                C = state_C,
-                simtime = as.integer(setup$simulation_time),
-                totaltime = as.integer(totaltime),
-                ai_surrogate_info = ai_surrogate_info
-            ), file = nameout)
+
+            SaveRObj(x = list(
+                         T = state_T,
+                         C = state_C,
+                         simtime = as.integer(setup$simulation_time),
+                         totaltime = as.integer(totaltime),
+                         ai_surrogate_info = ai_surrogate_info
+                     ), path = nameout)
            msgm("results stored in <", nameout, ">")
        }
    }
@ -172,3 +177,30 @@ GetWorkPackageSizesVector <- function(n_packages, package_size, len) {
    ids <- rep(1:n_packages, times = package_size, each = 1)[1:len]
    return(as.integer(table(ids)))
 }
+
+
+## Read an R object from `path` using readRDS() (".rds") or qs::qread()
+## (".qs"), chosen by file extension; other extensions raise an error
+ReadRObj <- function(path) {
+    ## extension extraction borrowed from tools::file_ext()
+    pos <- regexpr("\\.([[:alnum:]]+)$", path)
+    extension <- if (pos > -1L) substring(path, pos + 1L) else ""
+    switch(extension,
+           rds = readRDS(path),
+           qs  = qs::qread(path),
+           stop("ReadRObj: unsupported file extension in <", path, ">"))
+}
+
+## Store R object `x` to `path` using saveRDS() (".rds") or qs::qsave()
+## (".qs"), chosen by file extension; other extensions raise an error
+SaveRObj <- function(x, path) {
+    msgm("Storing to", path)
+    ## extension extraction borrowed from tools::file_ext()
+    pos <- regexpr("\\.([[:alnum:]]+)$", path)
+    extension <- if (pos > -1L) substring(path, pos + 1L) else ""
+    switch(extension,
+           rds = saveRDS(object = x, file = path),
+           qs  = qs::qsave(x = x, file = path),
+           ## without this default an unknown extension stores nothing, silently
+           stop("SaveRObj: unsupported file extension in <", path, ">"))
+}
+
--- a/src/poet.cpp
+++ b/src/poet.cpp
@ -52,17 +52,23 @@ static int MY_RANK = 0;

 static std::unique_ptr<Rcpp::List> global_rt_setup;

-// we need some layz evaluation, as we can't define the functions before the R
-// runtime is initialized
+// we need some lazy evaluation, as we can't define the functions
+// before the R runtime is initialized
 static std::optional<Rcpp::Function> master_init_R;
 static std::optional<Rcpp::Function> master_iteration_end_R;
 static std::optional<Rcpp::Function> store_setup_R;
+static std::optional<Rcpp::Function> ReadRObj_R;
+static std::optional<Rcpp::Function> SaveRObj_R;
+static std::optional<Rcpp::Function> source_R;

 static void init_global_functions(RInside &R) {
   R.parseEval(kin_r_library);
-  master_init_R = Rcpp::Function("master_init");
+  master_init_R		 = Rcpp::Function("master_init"); // looked up by name; presumably defined by kin_r_library evaluated above
   master_iteration_end_R = Rcpp::Function("master_iteration_end");
-  store_setup_R = Rcpp::Function("StoreSetup");
+  store_setup_R		 = Rcpp::Function("StoreSetup");
+  source_R		 = Rcpp::Function("source"); // applied to the runtime script in parseInitValues()
+  ReadRObj_R		 = Rcpp::Function("ReadRObj"); // reads .rds/.qs by file extension (R_lib/kin_r_library.R)
+  SaveRObj_R		 = Rcpp::Function("SaveRObj"); // writes .rds/.qs by file extension (R_lib/kin_r_library.R)
 }

 // HACK: this is a step back as the order and also the count of fields is
@ -150,8 +156,16 @@ ParseRet parseInitValues(char **argv, RuntimeParameters &params) {

  params.use_ai_surrogate = cmdl["ai-surrogate"];

+  // MDL: optional flag "qs" to switch to qsave()
+  params.out_ext = "rds";
+  if (cmdl["qs"]) {
+    MSG("Enabled <qs> output");
+    params.out_ext = "qs";
+  }
+
  if (MY_RANK == 0) {
    // MSG("Complete results storage is " + BOOL_PRINT(simparams.store_result));
+    MSG("Output format/extension is " + params.out_ext);
    MSG("Work Package Size: " + std::to_string(params.work_package_size));
    MSG("DHT is " + BOOL_PRINT(params.use_dht));
    MSG("AI Surrogate is " + BOOL_PRINT(params.use_ai_surrogate));
@ -207,18 +221,22 @@ ParseRet parseInitValues(char **argv, RuntimeParameters &params) {
  // R["dht_log"] = simparams.dht_log;

  try {
-    Rcpp::Function source("source");
-    Rcpp::Function readRDS("readRDS");
+    // Rcpp::Function source("source");
+    // Rcpp::Function ReadRObj("ReadRObj");
+    // Rcpp::Function SaveRObj("SaveRObj");

-    Rcpp::List init_params_ = readRDS(init_file);
+    Rcpp::List init_params_ = ReadRObj_R.value()(init_file);
    params.init_params = init_params_;
    
    global_rt_setup = std::make_unique<Rcpp::List>();
-    *global_rt_setup = source(runtime_file, Rcpp::Named("local", true));
+    *global_rt_setup = source_R.value()(runtime_file, Rcpp::Named("local", true));
    *global_rt_setup = global_rt_setup->operator[]("value");

+    // MDL add "out_ext" for output format to R setup
+    (*global_rt_setup)["out_ext"] = params.out_ext;
+
    params.timesteps =
-        Rcpp::as<std::vector<double>>(global_rt_setup->operator[]("timesteps"));
+      Rcpp::as<std::vector<double>>(global_rt_setup->operator[]("timesteps"));

  } catch (const std::exception &e) {
    ERRMSG("Error while parsing R scripts: " + std::string(e.what()));
@ -463,6 +481,9 @@ int main(int argc, char *argv[]) {
      MSG("Running POET version " + std::string(poet_version));
    }
    
+
+    init_global_functions(R);
+
    RuntimeParameters run_params;

    switch (parseInitValues(argv, run_params)) {
@ -501,32 +522,33 @@ int main(int argc, char *argv[]) {
    if (MY_RANK > 0) {
      chemistry.WorkerLoop();
    } else {
-      init_global_functions(R);
      // R.parseEvalQ("mysetup <- setup");
      // // if (MY_RANK == 0) { // get timestep vector from
      // // grid_init function ... //
      *global_rt_setup =
-          master_init_R.value()(*global_rt_setup, run_params.out_dir,
-                                init_list.getInitialGrid().asSEXP());
+	master_init_R.value()(*global_rt_setup, run_params.out_dir,
+			      init_list.getInitialGrid().asSEXP());
      // MDL: store all parameters
      // MSG("Calling R Function to store calling parameters");
      // R.parseEvalQ("StoreSetup(setup=mysetup)");
+      R["out_ext"] = run_params.out_ext;
+      R["out_dir"] = run_params.out_dir;
+
      if (run_params.use_ai_surrogate) {
        /* Incorporate ai surrogate from R */
        R.parseEvalQ(ai_surrogate_r_library);
        /* Use dht species for model input and output */
        R["ai_surrogate_species"] = init_list.getChemistryInit().dht_species.getNames();
-        R["out_dir"] = run_params.out_dir;

        const std::string ai_surrogate_input_script = init_list.getChemistryInit().ai_surrogate_input_script;
 	
-    	  MSG("AI: sourcing user-provided script");
-	      R.parseEvalQ(ai_surrogate_input_script);
+	MSG("AI: sourcing user-provided script");
+	R.parseEvalQ(ai_surrogate_input_script);
 	
        MSG("AI: initialize AI model");
-	      R.parseEval("model <- initiate_model()");
+	R.parseEval("model <- initiate_model()");
        R.parseEval("gpu_info()");
-        }
+      }
      
      MSG("Init done on process with rank " + std::to_string(MY_RANK));
      
@ -543,14 +565,15 @@ int main(int argc, char *argv[]) {
      
      R["profiling"] = profiling;
      R["setup"] = *global_rt_setup;
+      R["setup$out_ext"] = run_params.out_ext;

      string r_vis_code;
      r_vis_code =
-	"saveRDS(profiling, file=paste0(setup$out_dir,'/timings.rds'));";
+	"SaveRObj(x = profiling, path = paste0(out_dir, '/timings.', setup$out_ext));";
      R.parseEval(r_vis_code);
      
      MSG("Done! Results are stored as R objects into <" + run_params.out_dir +
-          "/timings.rds>");
+          "/timings." + run_params.out_ext);
    }
  }

--- a/src/poet.hpp.in
+++ b/src/poet.hpp.in
@ -39,7 +39,7 @@ static const inline std::string ai_surrogate_r_library = R"(@R_AI_SURROGATE_LIB@
 static const inline std::string r_runtime_parameters = "mysetup";

 const std::set<std::string> flaglist{"ignore-result", "dht", "P", "progress",
-                                     "interp", "ai-surrogate"};
+                                     "interp", "ai-surrogate", "qs"}; // "qs": parseInitValues() sets out_ext to "qs" when given
 const std::set<std::string> paramlist{
    "work-package-size", "dht-strategy", "dht-size",   "dht-snaps",
    "dht-file",          "interp-size",  "interp-min", "interp-bucket-entries"};
@ -51,6 +51,7 @@ constexpr uint32_t CHEM_DHT_SIZE_PER_PROCESS_MB = 1.5E3;
 struct RuntimeParameters {
  std::string out_dir;
  std::vector<double> timesteps;
+  std::string out_ext; // MDL added to accomodate for qs::qsave/qread

  bool print_progressbar;
  uint32_t work_package_size;