Metrics IO, rollback fixes, CSV writers, HDF5 overwrite

This commit is contained in:
rastogi 2025-11-20 18:47:15 +01:00
parent 40ece6cba3
commit 2d3b91fbe1
6 changed files with 154 additions and 81 deletions

View File

@ -550,7 +550,7 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
metrics_a = MPI_Wtime();
control->computeErrorMetrics(this->control_batch, surrogate_batch,
prop_names);
prop_names, n_cells);
control->writeErrorMetrics(ctrl_file_out_dir, prop_names);
metrics_b = MPI_Wtime();

View File

@ -94,8 +94,12 @@ void poet::ControlModule::writeErrorMetrics(
double stats_a, stats_b;
stats_a = MPI_Wtime();
writeStatsToCSV(metrics_history, species, out_dir, "overview");
write_metrics(metrics_history, species, out_dir, "metrics_overview");
writeSpeciesStatsToCSV(metrics_history, species, out_dir,
"species_overview.csv");
writeCellStatsToCSV(cell_metrics_history, species, out_dir,
"cell_overview.csv");
write_metrics(cell_metrics_history, species, out_dir,
"metrics_overview.hdf5");
stats_b = MPI_Wtime();
this->stats_t += stats_b - stats_a;
@ -132,30 +136,40 @@ std::optional<uint32_t> poet::ControlModule::getRollbackTarget(
return std::nullopt;
}
const auto &mape = metrics_history.back().mape;
for (size_t row = 0; row < mape.size(); row++) {
for (size_t col = 0; col < species.size() && col < mape[row].size();
col++) {
const auto &s_mape = metrics_history.back().mape;
if (mape[row][col] == 0) {
continue;
for (size_t sp_i = 2; sp_i < species.size(); sp_i++) {
if (s_mape[sp_i] == 0) {
continue;
}
if (s_mape[sp_i] > config.mape_threshold[sp_i]) {
if (last_checkpoint_written == 0) {
MSG(" Threshold exceeded but no checkpoint exists yet.");
return std::nullopt;
}
if (mape[row][col] > config.mape_threshold[col]) {
const auto &c_mape = cell_metrics_history.back().mape;
const auto &c_id = cell_metrics_history.back().id;
if (last_checkpoint_written == 0) {
MSG(" Threshold exceeded but no checkpoint exists yet.");
return std::nullopt;
}
rollback_enabled = true;
flush_request = true;
auto max_it = std::max_element(
c_mape.begin(), c_mape.end(),
[sp_i](const auto &a, const auto &b) { return a[sp_i] < b[sp_i]; });
MSG("Threshold exceeded " + species[col] + " has MAPE = " +
std::to_string(mape[row][col]) + " exceeding threshold = " +
std::to_string(config.mape_threshold[col]));
size_t max_idx = std::distance(c_mape.begin(), max_it);
uint32_t cell_id = c_id[max_idx];
double cell_mape = (*max_it)[sp_i];
return getRollbackIter();
}
rollback_enabled = true;
flush_request = true;
MSG("Threshold exceeded for " + species[sp_i] +
" with species-level MAPE = " + std::to_string(s_mape[sp_i]) +
" exceeding threshold = " +
std::to_string(config.mape_threshold[sp_i]) + ". Worst cell: ID=" +
std::to_string(cell_id) + " with MAPE=" + std::to_string(cell_mape));
return getRollbackIter();
}
}
rollback_enabled = false;
@ -166,7 +180,7 @@ std::optional<uint32_t> poet::ControlModule::getRollbackTarget(
void poet::ControlModule::computeErrorMetrics(
std::vector<std::vector<double>> &reference_values,
std::vector<std::vector<double>> &surrogate_values,
const std::vector<std::string> &species) {
const std::vector<std::string> &species, const uint32_t size_per_prop) {
// Skip metric computation if already in rollback/stabilization phase
if (rollback_enabled) {
@ -174,36 +188,54 @@ void poet::ControlModule::computeErrorMetrics(
}
const uint32_t n_cells = reference_values.size();
const uint32_t n_species = species.size();
const double ZERO_ABS = config.zero_abs;
SpeciesErrorMetrics metrics(n_cells, species.size(), global_iteration,
rollback_count);
CellErrorMetrics c_metrics(n_cells, n_species, global_iteration,
rollback_count);
SpeciesErrorMetrics s_metrics(n_species, global_iteration, rollback_count);
std::vector<double> species_err_sum(n_species, 0.0);
std::vector<double> species_sqr_sum(n_species, 0.0);
for (size_t cell_i = 0; cell_i < n_cells; cell_i++) {
metrics.id.push_back(reference_values[cell_i][0]);
c_metrics.id.push_back(reference_values[cell_i][0]);
for (size_t sp_i = 0; sp_i < species.size(); sp_i++) {
const double ref_value = reference_values[cell_i][sp_i + 1];
const double sur_value = surrogate_values[cell_i][sp_i + 1];
const double ZERO_ABS = config.zero_abs;
for (size_t sp_i = 2; sp_i < n_species; sp_i++) {
const double ref_value = reference_values[cell_i][sp_i];
const double sur_value = surrogate_values[cell_i][sp_i];
if (std::isnan(ref_value) || std::isnan(sur_value)) {
continue;
}
if (std::abs(ref_value) < ZERO_ABS) {
if (std::abs(sur_value) >= ZERO_ABS) {
metrics.mape[cell_i][sp_i] = 100.0;
metrics.rrmse[cell_i][sp_i] = 1.0;
species_err_sum[sp_i] += 1.0;
species_sqr_sum[sp_i] += 1.0;
c_metrics.mape[cell_i][sp_i] = 100.0;
c_metrics.rrmse[cell_i][sp_i] = 1.0;
}
} else {
double alpha = 1.0 - (sur_value / ref_value);
metrics.mape[cell_i][sp_i] = 100.0 * std::abs(alpha);
metrics.rrmse[cell_i][sp_i] = alpha * alpha;
species_err_sum[sp_i] += std::abs(alpha);
species_sqr_sum[sp_i] += alpha * alpha;
c_metrics.mape[cell_i][sp_i] = 100.0 * std::abs(alpha);
c_metrics.rrmse[cell_i][sp_i] = alpha * alpha;
}
}
}
metrics_history.push_back(metrics);
for (size_t sp_i = 2; sp_i < n_species; sp_i++) {
s_metrics.mape[sp_i] = 100.0 * (species_err_sum[sp_i] / size_per_prop);
s_metrics.rrmse[sp_i] = std::sqrt(species_sqr_sum[sp_i] / size_per_prop);
}
metrics_history.push_back(s_metrics);
cell_metrics_history.push_back(c_metrics);
}
void poet::ControlModule::processCheckpoint(

View File

@ -3,8 +3,8 @@
#include "Base/Macros.hpp"
#include "Chemistry/ChemistryModule.hpp"
#include "Transport/DiffusionModule.hpp"
#include "IO/HDF5Functions.hpp"
#include "Transport/DiffusionModule.hpp"
#include "poet.hpp"
#include <cstdint>
@ -24,21 +24,30 @@ struct ControlConfig {
std::vector<double> mape_threshold;
};
struct SpeciesErrorMetrics {
struct CellErrorMetrics {
std::vector<std::uint32_t> id;
std::vector<std::vector<double>> mape;
std::vector<std::vector<double>> rrmse;
uint32_t iteration = 0;
uint32_t rollback_count = 0;
SpeciesErrorMetrics(uint32_t n_cells, uint32_t n_species, uint32_t iter,
uint32_t rb_count)
CellErrorMetrics(uint32_t n_cells, uint32_t n_species, uint32_t iter,
uint32_t rb_count)
: mape(n_cells, std::vector<double>(n_species, 0.0)),
rrmse(n_cells, std::vector<double>(n_species, 0.0)), iteration(iter),
rollback_count(rb_count) {}
};
/// Snapshot of species-level error metrics taken at one iteration.
///
/// `mape` and `rrmse` hold one value per species and start out as 0.0;
/// `iteration` and `rollback_count` store the counters handed to the
/// constructor.
struct SpeciesErrorMetrics {
  std::vector<double> mape;  ///< one entry per species, zero-initialised
  std::vector<double> rrmse; ///< one entry per species, zero-initialised
  uint32_t iteration = 0;
  uint32_t rollback_count = 0;

  /// @param n_species number of entries allocated in `mape` and `rrmse`
  /// @param iter      value stored in `iteration`
  /// @param rb_count  value stored in `rollback_count`
  SpeciesErrorMetrics(uint32_t n_species, uint32_t iter, uint32_t rb_count)
      : iteration(iter), rollback_count(rb_count) {
    // Size both metric vectors up front so callers can index by species.
    mape.assign(n_species, 0.0);
    rrmse.assign(n_species, 0.0);
  }
};
class ControlModule {
@ -55,7 +64,8 @@ public:
void computeErrorMetrics(std::vector<std::vector<double>> &reference_values,
std::vector<std::vector<double>> &surrogate_values,
const std::vector<std::string> &species);
const std::vector<std::string> &species,
const uint32_t size_per_prop);
void processCheckpoint(DiffusionModule &diffusion, uint32_t &current_iter,
const std::string &out_dir,
@ -64,7 +74,6 @@ public:
std::optional<uint32_t>
getRollbackTarget(const std::vector<std::string> &species);
bool shouldBcastFlags();
bool getFlushRequest() const { return flush_request; }
@ -112,6 +121,7 @@ private:
bool bcast_flags = false;
std::vector<CellErrorMetrics> cell_metrics_history;
std::vector<SpeciesErrorMetrics> metrics_history;
double prep_t = 0.;

View File

@ -6,6 +6,7 @@
namespace poet {
struct SpeciesErrorMetrics;
struct CellErrorMetrics;
}
int write_checkpoint(const std::string &dir_path, const std::string &file_name,
@ -14,6 +15,6 @@ int write_checkpoint(const std::string &dir_path, const std::string &file_name,
int read_checkpoint(const std::string &dir_path, const std::string &file_name,
struct Checkpoint_s &checkpoint);
int write_metrics(const std::vector<poet::SpeciesErrorMetrics> &metrics_history,
const std::vector<std::string> &species_names,
const std::string &dir_path, const std::string &file_name);
int write_metrics(const std::vector<poet::CellErrorMetrics> &metrics_history,
const std::vector<std::string> &species_names,
const std::string &dir_path, const std::string &file_name);

View File

@ -13,7 +13,7 @@
namespace fs = std::filesystem;
int write_metrics(const std::vector<poet::SpeciesErrorMetrics> &metrics_history,
int write_metrics(const std::vector<poet::CellErrorMetrics> &metrics_history,
const std::vector<std::string> &species_names,
const std::string &dir_path, const std::string &file_name) {
@ -27,73 +27,97 @@ int write_metrics(const std::vector<poet::SpeciesErrorMetrics> &metrics_history,
for (size_t idx = 0; idx < metrics_history.size(); ++idx) {
const auto &metrics = metrics_history[idx];
std::string grp = "iter_" + std::to_string(metrics.iteration) + "_" +
std::string grp = "entry_" + std::to_string(idx) + "_iter_" +
std::to_string(metrics.iteration) + "_rb_" +
std::to_string(metrics.rollback_count);
size_t n_cells = metrics.id.size();
size_t n_species = metrics.mape[0].size();
H5Easy::dump(file, grp + "/meta", 0);
H5Easy::dump(file, grp + "/meta", 0, H5Easy::DumpMode::Overwrite);
// Attach attributes
H5Easy::dumpAttribute(file, grp + "/meta", "species_names", species_names);
H5Easy::dumpAttribute(file, grp + "/meta", "iteration", metrics.iteration);
H5Easy::dumpAttribute(file, grp + "/meta", "species_names", species_names, H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "iteration", metrics.iteration, H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "rollback_count",
metrics.rollback_count);
H5Easy::dumpAttribute(file, grp + "/meta", "n_cells", n_cells);
H5Easy::dumpAttribute(file, grp + "/meta", "n_species", n_species);
metrics.rollback_count, H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "n_cells", n_cells, H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "n_species", n_species, H5Easy::DumpMode::Overwrite);
// ─────────────────────────────────────────────
// 2. Real datasets
// ─────────────────────────────────────────────
H5Easy::dump(file, grp + "/cell_id", metrics.id);
H5Easy::dump(file, grp + "/mape", metrics.mape);
H5Easy::dump(file, grp + "/rrmse", metrics.rrmse);
H5Easy::dump(file, grp + "/mape", metrics.mape, H5Easy::DumpMode::Overwrite);
H5Easy::dump(file, grp + "/rrmse", metrics.rrmse, H5Easy::DumpMode::Overwrite);
}
return 0;
}
void writeStatsToCSV(const std::vector<poet::SpeciesErrorMetrics> &all_stats,
const std::vector<std::string> &species_names,
const std::string &out_dir, const std::string &filename) {
std::filesystem::path full_path = std::filesystem::path(out_dir) / filename;
std::ofstream out(full_path);
void writeCellStatsToCSV(const std::vector<poet::CellErrorMetrics> &all_stats,
const std::vector<std::string> &species_names,
const std::string &out_dir,
const std::string &filename) {
std::ofstream out(std::filesystem::path(out_dir) / filename);
if (!out.is_open()) {
std::cerr << "Could not open " << filename << " !" << std::endl;
return;
}
// header: CellID, Iteration, Rollback, Species, MAPE, RRMSE
// Header
out << std::left << std::setw(15) << "CellID" << std::setw(15) << "Iteration"
<< std::setw(15) << "Rollback" << std::setw(15) << "Species"
<< std::setw(15) << "MAPE" << std::setw(15) << "RRMSE"
<< "\n";
<< "\n"
<< std::string(90, '-') << "\n";
out << std::string(90, '-') << "\n";
// data rows: iterate over iterations
for (size_t iter_idx = 0; iter_idx < all_stats.size(); ++iter_idx) {
const auto &metrics = all_stats[iter_idx];
// Iterate over cells
// Data rows
for (const auto &metrics : all_stats) {
for (size_t cell_idx = 0; cell_idx < metrics.id.size(); ++cell_idx) {
// Iterate over species for this cell
for (size_t species_idx = 0; species_idx < species_names.size();
++species_idx) {
for (size_t sp_idx = 0; sp_idx < species_names.size(); ++sp_idx) {
out << std::left << std::setw(15) << metrics.id[cell_idx]
<< std::setw(15) << metrics.iteration << std::setw(15)
<< metrics.rollback_count << std::setw(15)
<< species_names[species_idx] << std::setw(15)
<< metrics.mape[cell_idx][species_idx] << std::setw(15)
<< metrics.rrmse[cell_idx][species_idx] << "\n";
<< species_names[sp_idx] << std::setw(15)
<< metrics.mape[cell_idx][sp_idx] << std::setw(15)
<< metrics.rrmse[cell_idx][sp_idx] << "\n";
}
}
out << "\n";
}
out.close();
std::cout << "Error metrics written to " << out_dir << "/" << filename
std::cout << "Cell error metrics written to " << out_dir << "/" << filename
<< "\n";
}
/**
 * @brief Write the species-level error metrics history as a text table.
 *
 * The file `out_dir/filename` is created (or truncated) and filled with a
 * header followed by one row per species for every entry of `all_stats`; a
 * blank line separates consecutive entries. Columns are left-aligned and
 * padded to 15 characters, so despite the function name the output is a
 * fixed-width table, not comma-separated.
 *
 * @param all_stats     history of species-level metrics to dump
 * @param species_names label printed in the "Species" column, indexed the
 *                      same way as `mape` / `rrmse`
 * @param out_dir       directory the file is written into
 * @param filename      name of the file inside `out_dir`
 *
 * Failures are reported on stderr and the function returns without throwing.
 */
void writeSpeciesStatsToCSV(
    const std::vector<poet::SpeciesErrorMetrics> &all_stats,
    const std::vector<std::string> &species_names, const std::string &out_dir,
    const std::string &filename) {
  std::ofstream out(std::filesystem::path(out_dir) / filename);
  if (!out.is_open()) {
    std::cerr << "Could not open " << filename << " !" << std::endl;
    return;
  }

  // Header
  out << std::left << std::setw(15) << "Iteration" << std::setw(15)
      << "Rollback" << std::setw(15) << "Species" << std::setw(15) << "MAPE"
      << std::setw(15) << "RRMSE"
      << "\n"
      << std::string(75, '-') << "\n";

  // Data rows
  for (const auto &metrics : all_stats) {
    // Stop at the shortest of the three sequences: the metric vectors are
    // sized independently of species_names, so indexing by the name count
    // alone could read past their end.
    for (size_t sp_idx = 0; sp_idx < species_names.size() &&
                            sp_idx < metrics.mape.size() &&
                            sp_idx < metrics.rrmse.size();
         ++sp_idx) {
      out << std::left << std::setw(15) << metrics.iteration << std::setw(15)
          << metrics.rollback_count << std::setw(15) << species_names[sp_idx]
          << std::setw(15) << metrics.mape[sp_idx] << std::setw(15)
          << metrics.rrmse[sp_idx] << "\n";
    }
    out << "\n";
  }

  // close() flushes; a failed flush or any earlier write error leaves the
  // stream in a failed state, in which case success must not be reported.
  out.close();
  if (out.fail()) {
    std::cerr << "Could not write " << filename << " !" << std::endl;
    return;
  }

  std::cout << "Species error metrics written to " << out_dir << "/" << filename
            << "\n";
}

View File

@ -1,6 +1,12 @@
#include <string>
void writeStatsToCSV(const std::vector<poet::SpeciesErrorMetrics> &all_stats,
const std::vector<std::string> &species_names,
const std::string &out_dir, const std::string &filename);
/// Writes the species-level metrics history to `out_dir/filename` as a text
/// table with the columns Iteration, Rollback, Species, MAPE and RRMSE.
/// Columns are padded to a fixed width rather than comma-separated.
/// @param all_stats     species-level metrics entries to write
/// @param species_names labels used for the "Species" column
/// @param out_dir       directory the file is created in
/// @param filename      file name inside `out_dir`
void writeSpeciesStatsToCSV(
const std::vector<poet::SpeciesErrorMetrics> &all_stats,
const std::vector<std::string> &species_names, const std::string &out_dir,
const std::string &filename);
/// Writes the per-cell metrics history to `out_dir/filename` as a text table
/// with the columns CellID, Iteration, Rollback, Species, MAPE and RRMSE,
/// one row per cell and species for every entry of `all_stats`.
/// Columns are padded to a fixed width rather than comma-separated.
/// @param all_stats     per-cell metrics entries to write
/// @param species_names labels used for the "Species" column
/// @param out_dir       directory the file is created in
/// @param filename      file name inside `out_dir`
void writeCellStatsToCSV(const std::vector<poet::CellErrorMetrics> &all_stats,
const std::vector<std::string> &species_names,
const std::string &out_dir,
const std::string &filename);
// namespace poet