Removed debugging logs, added cell_id to metric data

This commit is contained in:
rastogi 2025-11-23 17:27:11 +01:00
parent 2d3b91fbe1
commit 92e4414bfa
9 changed files with 40 additions and 39 deletions

View File

@ -113,23 +113,4 @@ setup <- list(
Grid = grid_setup, # Parameters related to the grid structure
Diffusion = diffusion_setup, # Parameters related to the diffusion process
Chemistry = chemistry_setup # Parameters related to the chemistry process
)
iterations <- 15000
dt <- 200
checkpoint_interval <- 100
control_interval <- 100
mape_threshold <- rep(0.1, 13)
mape_threshold[5] <- 1 #Charge
out_save <- seq(1000, iterations, by = 1000)
#out_save = c(seq(1, 10), seq(10, 100, by= 10), seq(200, iterations, by=100))
list(
timesteps = rep(dt, iterations),
store_result = TRUE,
out_save = out_save,
checkpoint_interval = checkpoint_interval,
control_interval = control_interval,
mape_threshold = mape_threshold
)

View File

@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --job-name=dolo_proto1_eps01_no_zeroabs
#SBATCH --output=dolo_proto1_eps01_no_zeroabs_%j.out
#SBATCH --error=dolo_proto1_eps01_no_zeroabs_%j.err
#SBATCH --job-name=proto2_eps01_3_rb
#SBATCH --output=proto2_eps01_3_rb_%j.out
#SBATCH --error=proto2_eps01_3_rb_%j.err
#SBATCH --partition=long
#SBATCH --nodes=6
#SBATCH --ntasks-per-node=24
@ -15,5 +15,5 @@ module purge
module load cmake gcc openmpi
#mpirun -n 144 ./poet dolo_fgcs_3.R dolo_fgcs_3.qs2 dolo_only_pqc
mpirun -n 144 ./poet --interp dolo_fgcs_3_rt.R dolo_fgcs_3.qs2 dolo_proto1_eps01_no_zeroabs
mpirun -n 144 ./poet --interp dolo_fgcs_3_rt.R dolo_fgcs_3.qs2 proto2_eps01_3_rb
#mpirun -n 144 ./poet --interp barite_fgcs_4_new/barite_fgcs_4_new_rt.R barite_fgcs_4_new/barite_fgcs_4_new.qs2 barite

Binary file not shown.

View File

@ -522,8 +522,6 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
chem_field = out_vec;
/* do master stuff */
std::cout << "[DEBUG] control_batch.size() = " << this->control_batch.size()
<< std::endl;
if (!this->control_batch.empty()) {
std::cout << "[Master] Processing " << this->control_batch.size()

View File

@ -151,8 +151,6 @@ void poet::ChemistryModule::processCtrlPkgs(
WorkerRunWorkPackage(control_wp, current_sim_time, dt);
phreeqc_end = MPI_Wtime();
std::cout << "PQC RAN" << std::endl;
timings.ctrl_phreeqc_t += phreeqc_end - phreeqc_start;
copyPkgs(control_wp, mpi_buffer);

View File

@ -96,8 +96,8 @@ void poet::ControlModule::writeErrorMetrics(
stats_a = MPI_Wtime();
writeSpeciesStatsToCSV(metrics_history, species, out_dir,
"species_overview.csv");
writeCellStatsToCSV(cell_metrics_history, species, out_dir,
"cell_overview.csv");
// writeCellStatsToCSV(cell_metrics_history, species, out_dir,
// "cell_overview.csv");
write_metrics(cell_metrics_history, species, out_dir,
"metrics_overview.hdf5");
stats_b = MPI_Wtime();
@ -110,6 +110,7 @@ uint32_t poet::ControlModule::getRollbackIter() {
uint32_t last_iter = ((global_iteration - 1) / config.checkpoint_interval) *
config.checkpoint_interval;
/*
uint32_t rollback_iter = (last_iter <= last_checkpoint_written)
? last_iter
: last_checkpoint_written;
@ -120,6 +121,8 @@ uint32_t poet::ControlModule::getRollbackIter() {
", returning=" + std::to_string(last_checkpoint_written));
return last_checkpoint_written;
*/
return last_iter;
}
std::optional<uint32_t> poet::ControlModule::getRollbackTarget(
@ -140,7 +143,8 @@ std::optional<uint32_t> poet::ControlModule::getRollbackTarget(
for (size_t sp_i = 2; sp_i < species.size(); sp_i++) {
if (s_mape[sp_i] == 0) {
// skip charge
if (s_mape[sp_i] == 0 || sp_i == 4) {
continue;
}
if (s_mape[sp_i] > config.mape_threshold[sp_i]) {
@ -201,6 +205,8 @@ void poet::ControlModule::computeErrorMetrics(
for (size_t cell_i = 0; cell_i < n_cells; cell_i++) {
c_metrics.id.push_back(reference_values[cell_i][0]);
c_metrics.mape[cell_i][0] = reference_values[cell_i][0];
c_metrics.rrmse[cell_i][0] = reference_values[cell_i][0];
for (size_t sp_i = 2; sp_i < n_species; sp_i++) {
const double ref_value = reference_values[cell_i][sp_i];
@ -226,6 +232,14 @@ void poet::ControlModule::computeErrorMetrics(
c_metrics.mape[cell_i][sp_i] = 100.0 * std::abs(alpha);
c_metrics.rrmse[cell_i][sp_i] = alpha * alpha;
// Log extreme MAPE values for debugging
if (c_metrics.mape[cell_i][sp_i] > 100.0) {
std::cout << "WARNING: High MAPE detected - Cell="
<< c_metrics.id[cell_i] << ", Species=" << species[sp_i]
<< ", MAPE=" << c_metrics.mape[cell_i][sp_i]
<< "%, Ref=" << ref_value << ", Sur=" << sur_value
<< ", Alpha=" << alpha << std::endl;
}
}
}
}
@ -242,7 +256,7 @@ void poet::ControlModule::processCheckpoint(
DiffusionModule &diffusion, uint32_t &current_iter,
const std::string &out_dir, const std::vector<std::string> &species) {
if (flush_request) {
if (flush_request && rollback_count < 3) {
uint32_t target = getRollbackIter();
readCheckpoint(diffusion, current_iter, target, out_dir);

View File

@ -27,9 +27,13 @@ int write_metrics(const std::vector<poet::CellErrorMetrics> &metrics_history,
for (size_t idx = 0; idx < metrics_history.size(); ++idx) {
const auto &metrics = metrics_history[idx];
std::string grp = "entry_" + std::to_string(idx) + "_iter_" +
/*
std::string grp = "entry_" + std::to_string(idx) + "_iter_" +
std::to_string(metrics.iteration) + "_rb_" +
std::to_string(metrics.rollback_count);
*/
std::string grp = "iter_" + std::to_string(metrics.iteration) + "_rb_" +
std::to_string(metrics.rollback_count);
size_t n_cells = metrics.id.size();
size_t n_species = metrics.mape[0].size();
@ -37,24 +41,29 @@ int write_metrics(const std::vector<poet::CellErrorMetrics> &metrics_history,
H5Easy::dump(file, grp + "/meta", 0, H5Easy::DumpMode::Overwrite);
// Attach attributes
H5Easy::dumpAttribute(file, grp + "/meta", "species_names", species_names, H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "iteration", metrics.iteration, H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "species_names", species_names,
H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "iteration", metrics.iteration,
H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "rollback_count",
metrics.rollback_count, H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "n_cells", n_cells, H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "n_species", n_species, H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "n_cells", n_cells,
H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "n_species", n_species,
H5Easy::DumpMode::Overwrite);
// ─────────────────────────────────────────────
// 2. Real datasets
// ─────────────────────────────────────────────
H5Easy::dump(file, grp + "/mape", metrics.mape, H5Easy::DumpMode::Overwrite);
H5Easy::dump(file, grp + "/rrmse", metrics.rrmse, H5Easy::DumpMode::Overwrite);
H5Easy::dump(file, grp + "/mape", metrics.mape,
H5Easy::DumpMode::Overwrite);
H5Easy::dump(file, grp + "/rrmse", metrics.rrmse,
H5Easy::DumpMode::Overwrite);
}
return 0;
}
void writeCellStatsToCSV(const std::vector<poet::CellErrorMetrics> &all_stats,
const std::vector<std::string> &species_names,
const std::string &out_dir,

View File

@ -1,4 +1,5 @@
#include <string>
#include <vector>
void writeSpeciesStatsToCSV(
const std::vector<poet::SpeciesErrorMetrics> &all_stats,