update benchmarks for ai surrogate

2025-12-15 12:28:22 +01:00 · 2025-12-10 19:59:15 +01:00 · 2025-12-10 19:59:15 +01:00 · 49135615d1
commit 49135615d1
parent c7d1fc152c
16 changed files with 160 additions and 305 deletions
--- a/bench/barite/CMakeLists.txt
+++ b/bench/barite/CMakeLists.txt
@ -1,12 +1,10 @@
 # Create a list of files 
 set(bench_files
    barite_200.R
-    barite_het.R
 )

 set(runtime_files
    barite_200_rt.R
-    barite_het_rt.R
 )

 # add_custom_target(barite_bench DEPENDS ${bench_files} ${runtime_files})
--- a/bench/barite/barite_200_surrogate_input_script.R
+++ b/bench/barite/barite_200_surrogate_input_script.R
@ -0,0 +1,82 @@
+scale_min_max <- function(x, min, max, backtransform) {
+    if (backtransform) {
+        return((x * (max - min)) + min)
+    } else {
+        return((x - min) / (max - min))
+    }
+}
+
+scale_standardizer <- function(x, mean, scale, backtransform) {
+    if(backtransform){
+        return(x * scale + mean)
+    }
+    else{
+        return((x-mean) / scale)
+    }
+}
+
+standard <- list(mean = c(H = 111.01243361730982, O= 55.50673140754027, Ba= 0.0016161137065825058,
+                         Cl= 0.0534503766678322, S=0.00012864849674669584, Sr=0.0252377348949622, 
+                         Barite_kin=0.05292312117000998, Celestite_kin=0.9475491659328229), 
+                scale = c(H=1.0, O=0.00048139729680698453, Ba=0.008945717576237102, Cl=0.03587363709464328,
+                         S=0.00012035100591827131, Sr=0.01523052668095922, Barite_kin=0.21668648247230615,
+                         Celestite_kin=0.21639449682671968))
+
+minmax <- list(min = c(H = 111.012433592824, O = 55.5062185549492, Charge = -3.1028354471876e-08, 
+                       Ba = 1.87312878574393e-141, Cl = 0, `S(6)` = 4.24227510643685e-07, 
+                       Sr = 0.00049382996130541, Barite = 0.000999542409828586, Celestite = 0.244801877115968),
+               max = c(H = 111.012433679682, O = 55.5087003521685, Charge = 5.27666636082035e-07, 
+                       Ba = 0.0908849779513762, Cl = 0.195697626449355, `S(6)` = 0.000620774752665846, 
+                       Sr = 0.0558680070692722, Barite = 0.756779139057097, Celestite = 1.00075422160624
+                       ))
+
+ai_surrogate_species_input = c("H", "O", "Ba", "Cl", "S", "Sr", "Barite_kin", "Celestite_kin")
+ai_surrogate_species_output = c("O", "Ba", "S", "Sr", "Barite_kin", "Celestite_kin")
+
+
+threshold <- list(species = "Cl", value = 2E-10)
+
+preprocess <- function(df) {
+    if (!is.data.frame(df))
+        df <- as.data.frame(df, check.names = FALSE)
+    
+    as.data.frame(lapply(colnames(df),
+                         function(x) scale_standardizer(x=df[x],
+                                                   mean=standard$mean[x],
+                                                   scale=standard$scale[x],
+                                                   backtransform=FALSE)),
+                  check.names = FALSE)
+}
+
+postprocess <- function(df) {
+    if (!is.data.frame(df))
+        df <- as.data.frame(df, check.names = FALSE)
+    
+    as.data.frame(lapply(colnames(df),
+                         function(x) scale_standardizer(x=df[x],
+                                                   mean=standard$mean[x],
+                                                   scale=standard$scale[x],
+                                                   backtransform=TRUE)),
+                  check.names = FALSE)
+}
+
+mass_balance <- function(predictors, prediction) {
+    dBa <- abs(prediction$Ba + prediction$Barite_kin -
+               predictors$Ba - predictors$Barite_kin)
+    dSr <- abs(prediction$Sr + prediction$Celestite_kin -
+               predictors$Sr - predictors$Celestite_kin)
+    dS <- abs(prediction$S + prediction$Celestite_kin + prediction$Barite_kin - 
+              predictors$S - predictors$Celestite_kin - predictors$Barite_kin)
+    return(dBa + dSr + dS)
+}
+
+validate_predictions <- function(predictors, prediction) {
+    epsilon <- 1E-5
+    mb <- mass_balance(predictors, prediction)
+    msgm("Mass balance mean:", mean(mb))
+    msgm("Mass balance variance:", var(mb))
+    ret <- mb < epsilon
+    msgm("Rows where mass balance meets threshold", epsilon, ":",
+         sum(ret), "/", nrow(predictors))
+    return(ret)
+}
--- a/bench/barite/barite_200ai_surrogate_input_script.R
+++ b/bench/barite/barite_200ai_surrogate_input_script.R
@ -1,48 +0,0 @@
-## load a pretrained model from tensorflow file
-## Use the global variable "ai_surrogate_base_path" when using file paths
-## relative to the input script
-initiate_model <- function() {
-  init_model <- normalizePath(paste0(ai_surrogate_base_path,
-                                     "model_min_max_float64.keras"))
-  return(load_model_tf(init_model))
-}
-
-scale_min_max <- function(x, min, max, backtransform) {
-  if (backtransform) {
-    return((x * (max - min)) + min)
-  } else {
-    return((x - min) / (max - min))
-  }
-}
-
-preprocess <- function(df, backtransform = FALSE, outputs = FALSE) {
-  minmax_file <- normalizePath(paste0(ai_surrogate_base_path,
-                                      "min_max_bounds.rds"))
-  global_minmax <- readRDS(minmax_file)
-  for (column in colnames(df)) {
-    df[column] <- lapply(df[column],
-                         scale_min_max,
-                         global_minmax$min[column],
-                         global_minmax$max[column],
-                         backtransform)
-  }
-  return(df)
-}
-
-mass_balance <- function(predictors, prediction) {
-  dBa <- abs(prediction$Ba + prediction$Barite -
-             predictors$Ba - predictors$Barite)
-  dSr <- abs(prediction$Sr + prediction$Celestite -
-             predictors$Sr - predictors$Celestite)
-  return(dBa + dSr)
-}
-
-validate_predictions <- function(predictors, prediction) {
-  epsilon <- 3e-5
-  mb <- mass_balance(predictors, prediction)
-  msgm("Mass balance mean:", mean(mb))
-  msgm("Mass balance variance:", var(mb))
-  msgm("Rows where mass balance meets threshold", epsilon, ":",
-       sum(mb < epsilon))
-  return(mb < epsilon)
-}
--- a/bench/barite/barite_50ai_all.keras
+++ b/bench/barite/barite_50ai_all.keras
--- a/bench/barite/barite_het.R
+++ b/bench/barite/barite_het.R
@ -1,32 +0,0 @@
-grid_def <- matrix(c(2, 3), nrow = 2, ncol = 5)
-
-# Define grid configuration for POET model
-grid_setup <- list(
-    pqc_in_file = "./barite_het.pqi",
-    pqc_db_file = "./db_barite.dat", # Path to the database file for Phreeqc
-    grid_def = grid_def, # Definition of the grid, containing IDs according to the Phreeqc input script
-    grid_size = c(ncol(grid_def), nrow(grid_def)), # Size of the grid in meters
-    constant_cells = c() # IDs of cells with constant concentration
-)
-
-diffusion_setup <- list(
-    boundaries = list(
-        "W" = list(
-            "type" = rep("constant", nrow(grid_def)),
-            "sol_id" = rep(4, nrow(grid_def)),
-            "cell" = seq_len(nrow(grid_def))
-        )
-    ),
-    alpha_x = 1e-6,
-    alpha_y = matrix(runif(10, 1e-8, 1e-7),
-        nrow = nrow(grid_def),
-        ncol = ncol(grid_def)
-    )
-)
-
-# Define a setup list for simulation configuration
-setup <- list(
-    Grid = grid_setup, # Parameters related to the grid structure
-    Diffusion = diffusion_setup, # Parameters related to the diffusion process
-    Chemistry = list()
-)
--- a/bench/barite/barite_het.pqi
+++ b/bench/barite/barite_het.pqi
@ -1,80 +0,0 @@
-## Initial: everywhere equilibrium with Celestite NB: The aqueous
-## solution *resulting* from this calculation is to be used as initial
-## state everywhere in the domain
-SOLUTION 1
-units	mol/kgw
-water	1
-temperature	25
-pH	7
-pe	4
-S(6)	1e-12
-Sr	1e-12
-Ba      1e-12
-Cl      1e-12
-PURE 1
-Celestite 0.0 1
-
-SAVE SOLUTION 2 # <- phreeqc keyword to store and later reuse these results
-END
-
-RUN_CELLS 
-   -cells 1
-
-COPY solution 1 2-3
-
-## Here a 5x2 domain:
-
-       |---+---+---+---+---|
-    -> | 2 | 2 | 2 | 2 | 2 |
-     4 |---+---+---+---+---|
-    -> | 3 | 3 | 3 | 3 | 3 |
-       |---+---+---+---+---|
-
-## East boundary: "injection" of solution 4. North, W, S boundaries: closed
-
-## Here the two distinct zones: nr 2 with kinetics Celestite (initial
-## amount is 0, is then allowed to precipitate) and nr 3 with kinetic
-## Celestite and Barite (both initially > 0) where the actual
-## replacement takes place
-
-#USE SOLUTION 2  <- PHREEQC keyword to reuse the results from previous calculation
-KINETICS 2
-Celestite
-m  0    # Allowed to precipitate
-parms 10.0
-tol 1e-9
-
-END
-
-#USE SOLUTION 2  <- PHREEQC keyword to reuse the results from previous calculation
-KINETICS 3
-Barite
-m 0.001
-parms 50. 
-tol 1e-9
-Celestite
-m 1
-parms 10.0 
-tol 1e-9
-END
-
-## A BaCl2 solution (nr 4) is "injected" from the left boundary:
-SOLUTION 4
-units	mol/kgw
-pH 7
-water 1
-temp 25
-Ba  0.1
-Cl  0.2
-END
-## NB: again, the *result* of the SOLUTION 4 script defines the
-## concentration of all elements (+charge, tot H, tot O)
-
-## Ideally, in the initial state SOLUTION 1 we should not have to
-## define the 4 elemental concentrations (S(6), Sr, Ba and Cl) but
-## obtain them having run once the scripts with the aqueous solution
-## resulting from SOLUTION 1 once with KINETICS 2 and once with
-## KINETICS 3.
-
-RUN_CELLS
- -cells 2-4
--- a/bench/barite/barite_het_rt.R
+++ b/bench/barite/barite_het_rt.R
@ -1,4 +0,0 @@
-list(
-    timesteps = rep(50, 100),
-    store_result = TRUE
-)
--- a/bench/barite/barite_trained.weights.h5
+++ b/bench/barite/barite_trained.weights.h5
--- a/bench/dolo/CMakeLists.txt
+++ b/bench/dolo/CMakeLists.txt
@ -1,11 +1,9 @@
 set(bench_files
-    dolo_inner_large.R
    dolo_interp.R
 )

 set(runtime_files
-    dolo_inner_large_rt.R
-    dolo_interp_rt.R
+    dolo_interp_rt_dt20000.R
 )

 ADD_BENCH_TARGET(
--- a/bench/dolo/dolo_inner.rds
+++ b/bench/dolo/dolo_inner.rds
--- a/bench/dolo/dolo_inner_large.R
+++ b/bench/dolo/dolo_inner_large.R
@ -1,115 +0,0 @@
-rows <- 2000
-cols <- 1000
-
-grid_def <- matrix(2, nrow = rows, ncol = cols)
-
-# Define grid configuration for POET model
-grid_setup <- list(
-    pqc_in_file = "./dol.pqi",
-    pqc_db_file = "./phreeqc_kin.dat", # Path to the database file for Phreeqc
-    grid_def = grid_def, # Definition of the grid, containing IDs according to the Phreeqc input script
-    grid_size = c(cols, rows) / 100, # Size of the grid in meters
-    constant_cells = c() # IDs of cells with constant concentration
-)
-
-bound_size <- 2
-
-diffusion_setup <- list(
-    inner_boundaries = list(
-        "row" = c(400, 1400, 1600),
-        "col" = c(200, 800, 800),
-        "sol_id" = c(3, 4, 4)
-    ),
-    alpha_x = 1e-6,
-    alpha_y = 1e-6
-)
-
-check_sign_cal_dol_dht <- function(old, new) {
-    if ((old["Calcite"] == 0) != (new["Calcite"] == 0)) {
-        return(TRUE)
-    }
-    if ((old["Dolomite"] == 0) != (new["Dolomite"] == 0)) {
-        return(TRUE)
-    }
-    return(FALSE)
-}
-
-fuzz_input_dht_keys <- function(input) {
-    dht_species <- c(
-        "H" = 3,
-        "O" = 3,
-        "Charge" = 3,
-        "C(4)" = 6,
-        "Ca" = 6,
-        "Cl" = 3,
-        "Mg" = 5,
-        "Calcite" = 4,
-        "Dolomite" = 4
-    )
-    return(input[names(dht_species)])
-}
-
-check_sign_cal_dol_interp <- function(to_interp, data_set) {
-    dht_species <- c(
-        "H" = 3,
-        "O" = 3,
-        "Charge" = 3,
-        "C(4)" = 6,
-        "Ca" = 6,
-        "Cl" = 3,
-        "Mg" = 5,
-        "Calcite" = 4,
-        "Dolomite" = 4
-    )
-    data_set <- as.data.frame(do.call(rbind, data_set), check.names = FALSE, optional = TRUE)
-    names(data_set) <- names(dht_species)
-    cal <- (data_set$Calcite == 0) == (to_interp["Calcite"] == 0)
-    dol <- (data_set$Dolomite == 0) == (to_interp["Dolomite"] == 0)
-
-    cal_dol_same_sig <- cal == dol
-    return(rev(which(!cal_dol_same_sig)))
-}
-
-check_neg_cal_dol <- function(result) {
-    neg_sign <- (result["Calcite"] < 0) || (result["Dolomite"] < 0)
-    return(neg_sign)
-}
-
-# Optional when using Interpolation (example with less key species and custom
-# significant digits)
-
-pht_species <- c(
-    "C(4)" = 3,
-    "Ca" = 3,
-    "Mg" = 2,
-    "Calcite" = 2,
-    "Dolomite" = 2
-)
-
-chemistry_setup <- list(
-    dht_species = c(
-        "H" = 3,
-        "O" = 3,
-        "Charge" = 3,
-        "C(4)" = 6,
-        "Ca" = 6,
-        "Cl" = 3,
-        "Mg" = 5,
-        "Calcite" = 4,
-        "Dolomite" = 4
-    ),
-    pht_species = pht_species,
-    hooks = list(
-        dht_fill = check_sign_cal_dol_dht,
-        dht_fuzz = fuzz_input_dht_keys,
-        interp_pre = check_sign_cal_dol_interp,
-        interp_post = check_neg_cal_dol
-    )
-)
-
-# Define a setup list for simulation configuration
-setup <- list(
-    Grid = grid_setup, # Parameters related to the grid structure
-    Diffusion = diffusion_setup, # Parameters related to the diffusion process
-    Chemistry = chemistry_setup # Parameters related to the chemistry process
-)
--- a/bench/dolo/dolo_inner_large_rt.R
+++ b/bench/dolo/dolo_inner_large_rt.R
@ -1,10 +0,0 @@
-iterations <- 500
-dt <- 50
-
-out_save <- seq(5, iterations, by = 5)
-
-list(
-    timesteps = rep(dt, iterations),
-    store_result = TRUE,
-    out_save = out_save
-)
--- a/bench/dolo/dolo_interp.R
+++ b/bench/dolo/dolo_interp.R
@ -7,6 +7,7 @@ grid_def <- matrix(2, nrow = rows, ncol = cols)
 grid_setup <- list(
    pqc_in_file = "./dol.pqi",
    pqc_db_file = "./phreeqc_kin.dat",
+    pqc_with_redox = TRUE,
    grid_def = grid_def,
    grid_size = c(5, 2.5),
    constant_cells = c()
@ -120,7 +121,8 @@ chemistry_setup <- list(
        ## dht_fuzz = fuzz_input_dht_keys,
        interp_pre = check_sign_cal_dol_interp,
        interp_post = check_neg_cal_dol
-    )
+    ),
+    ai_surrogate_input_script = "./dolo_surrogate_input_script.R"
 )

 ## Define a setup list for simulation configuration
--- a/bench/dolo/dolo_interp_rt.R
+++ b/bench/dolo/dolo_interp_rt.R
@ -1,10 +0,0 @@
-iterations <- 2000
-dt <- 200
-
-out_save <- c(1, 10, 20, seq(40, iterations, by = 40))
-
-list(
-    timesteps = rep(dt, iterations),
-    store_result = TRUE,
-    out_save = out_save
-)
--- a/bench/dolo/dolo_surrogate_input_script.R
+++ b/bench/dolo/dolo_surrogate_input_script.R
@ -0,0 +1,74 @@
+scale_min_max <- function(x, min, max, backtransform) {
+    if (backtransform) {
+        return((x * (max - min)) + min)
+    } else {
+        return((x - min) / (max - min))
+    }
+}
+
+scale_standardizer <- function(x, mean, scale, backtransform) {
+    if(backtransform){
+        return(x * scale + mean)
+    }
+    else{
+        return((x-mean) / scale)
+    }
+}
+
+standard <- list(mean = c(H = 111.0124335959659, O=55.5065739707202, 'C(-4)'=1.5788555695339323e-15, 'C(4)'=0.00011905649680154037,
+                         Ca= 0.00012525858032576948, Cl=0.00010368471137502122, Mg=4.5640346338857756e-05, Calcite_kin=0.0001798444527389999, 
+                         Dolomite_kin=7.6152065281986634e-06), 
+                scale = c(H=1.0, O=3.54850912318837e-05, 'C(-4)'=2.675559053860093e-14, 'C(4)'=1.1829735682920146e-05, Ca=1.207381343127647e-05, Cl=0.00024586541554245565,
+                         Mg=0.00011794307217698012, Calcite_kin=5.946457663332385e-05, Dolomite_kin=2.688201435907049e-05))
+
+
+ai_surrogate_species_input = c("H", "O", "C(-4)", "C(4)", "Ca", "Cl", "Mg", "Calcite_kin", "Dolomite_kin")
+ai_surrogate_species_output = c("H", "O", "C(-4)", "C(4)", "Ca", "Mg", "Calcite_kin", "Dolomite_kin")
+
+
+threshold <- list(species = "Cl", value = 2E-10)
+
+preprocess <- function(df) {
+    if (!is.data.frame(df))
+        df <- as.data.frame(df, check.names = FALSE)
+    
+    as.data.frame(lapply(colnames(df),
+                         function(x) scale_standardizer(x=df[x],
+                                                   mean=standard$mean[x],
+                                                   scale=standard$scale[x],
+                                                   backtransform=FALSE)),
+                  check.names = FALSE)
+}
+
+postprocess <- function(df) {
+    if (!is.data.frame(df))
+        df <- as.data.frame(df, check.names = FALSE)
+    
+    as.data.frame(lapply(colnames(df),
+                         function(x) scale_standardizer(x=df[x],
+                                                   mean=standard$mean[x],
+                                                   scale=standard$scale[x],
+                                                   backtransform=TRUE)),
+                  check.names = FALSE)
+}
+
+mass_balance <- function(predictors, prediction) {
+    dCa <- abs(prediction$Ca + prediction$Calcite_kin + prediction$Dolomite_kin -
+               predictors$Ca - predictors$Calcite_kin - predictors$Dolomite_kin)
+    dC <- abs(prediction$'C(-4)' + prediction$'C(4)' + prediction$Calcite_kin + 2 * prediction$Dolomite_kin 
+                - predictors$'C(-4)' - predictors$'C(4)' - predictors$Calcite_kin - 2 * predictors$Dolomite_kin)
+    dMg <- abs(prediction$Mg + prediction$Dolomite_kin - 
+              predictors$Mg - predictors$Dolomite_kin)
+    return(dCa + dC + dMg)
+}
+
+validate_predictions <- function(predictors, prediction) {
+    epsilon <- 1E-8
+    mb <- mass_balance(predictors, prediction)
+    msgm("Mass balance mean:", mean(mb))
+    msgm("Mass balance variance:", var(mb))
+    ret <- mb < epsilon
+    msgm("Rows where mass balance meets threshold", epsilon, ":",
+         sum(ret))
+    return(ret)
+}
--- a/bench/dolo/dolomite_trained.weights.h5
+++ b/bench/dolo/dolomite_trained.weights.h5