mirror of
https://git.gfz-potsdam.de/naaice/poet.git
synced 2025-12-15 12:28:22 +01:00
Added time measurements
This commit is contained in:
parent
4068fc01e4
commit
4ac8914175
@ -105,10 +105,10 @@ setup <- list(
|
||||
)
|
||||
|
||||
|
||||
iterations <- 250
|
||||
iterations <- 100
|
||||
dt <- 200
|
||||
checkpoint_interval <- 50
|
||||
control_interval <- 50
|
||||
checkpoint_interval <- 20
|
||||
control_interval <- 20
|
||||
mape_threshold <- rep(3.5e-3, 13)
|
||||
#out_save <- seq(50, iterations, by = 50)
|
||||
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --job-name=dolo_warmup_debug
|
||||
#SBATCH --output=dolo_warmup_debug%j.out
|
||||
#SBATCH --error=dolo_warmup_debug%j.err
|
||||
#SBATCH --job-name=dolo
|
||||
#SBATCH --output=dolo_%j.out
|
||||
#SBATCH --error=dolo_%j.err
|
||||
#SBATCH --partition=long
|
||||
#SBATCH --nodes=4
|
||||
#SBATCH --ntasks=96
|
||||
@ -14,5 +14,5 @@ source /etc/profile.d/modules.sh
|
||||
module purge
|
||||
module load cmake gcc openmpi
|
||||
|
||||
mpirun -n 96 ./poet --interp dolo_fgcs_3.R dolo_fgcs_3.qs2 dolo_warmup_debug
|
||||
#mpirun -n 96 ./poet --interp barite_fgcs_2.R barite_fgcs_2.qs2 bar_fgcs_500_eps
|
||||
mpirun -n 96 ./poet --interp dolo_fgcs_3.R dolo_fgcs_3.qs2 dolo
|
||||
#mpirun -n 96 ./poet --interp barite_fgcs_2.R barite_fgcs_2.qs2 bar_warmup
|
||||
Binary file not shown.
Binary file not shown.
@ -174,6 +174,12 @@ public:
|
||||
*/
|
||||
auto GetMasterLoopTime() const { return this->send_recv_t; }
|
||||
|
||||
auto GetMasterRecvCtrlDataTime() const { return this->recv_ctrl_t; }
|
||||
|
||||
auto GetMasterUnshuffleTime() const { return this->shuf_t; }
|
||||
|
||||
auto GetMasterCtrlMetricsTime() const { return this->metrics_t; }
|
||||
|
||||
/**
|
||||
* **Master only** Collect and return all accumulated timings recorded by
|
||||
* workers to run Phreeqc simulation.
|
||||
@ -411,6 +417,10 @@ protected:
|
||||
double seq_t = 0.;
|
||||
double send_recv_t = 0.;
|
||||
|
||||
double recv_ctrl_t = 0.;
|
||||
double shuf_t = 0.;
|
||||
double metrics_t = 0.;
|
||||
|
||||
std::array<double, 2> base_totals{0};
|
||||
|
||||
bool print_progessbar{false};
|
||||
|
||||
@ -335,6 +335,7 @@ inline void poet::ChemistryModule::MasterRecvPkgs(worker_list_t &w_list,
|
||||
/* declare most of the variables here */
|
||||
int need_to_receive = 1;
|
||||
double idle_a, idle_b;
|
||||
double recv_ctrl_a, recv_ctrl_b;
|
||||
int p, size;
|
||||
std::vector<double> recv_buffer;
|
||||
|
||||
@ -373,6 +374,7 @@ inline void poet::ChemistryModule::MasterRecvPkgs(worker_list_t &w_list,
|
||||
break;
|
||||
}
|
||||
case LOOP_CTRL: {
|
||||
recv_ctrl_a = MPI_Wtime();
|
||||
/* layout of buffer is [phreeqc][surrogate] */
|
||||
MPI_Get_count(&probe_status, MPI_DOUBLE, &size);
|
||||
recv_buffer.resize(size);
|
||||
@ -385,6 +387,8 @@ inline void poet::ChemistryModule::MasterRecvPkgs(worker_list_t &w_list,
|
||||
|
||||
std::copy(recv_buffer.begin() + (size / 2), recv_buffer.begin() + size,
|
||||
w_list[p - 1].surrogate_addr);
|
||||
recv_ctrl_b = MPI_Wtime();
|
||||
recv_ctrl_t += recv_ctrl_b - recv_ctrl_a;
|
||||
|
||||
handled = true;
|
||||
break;
|
||||
@ -450,6 +454,7 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
|
||||
int free_workers;
|
||||
int i_pkgs;
|
||||
int ftype;
|
||||
double shuf_a, shuf_b, metrics_a, metrics_b;
|
||||
|
||||
const std::vector<uint32_t> wp_sizes_vector =
|
||||
CalculateWPSizesVector(this->n_cells, this->wp_size);
|
||||
@ -464,35 +469,6 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
|
||||
MPI_INT);
|
||||
}
|
||||
|
||||
// ftype = CHEM_IP_ENABLE;
|
||||
// ftype = CHEM_WARMUP_PHASE;
|
||||
/*
|
||||
PropagateFunctionType(ftype);
|
||||
int warmup_flag = this->warmup_enabled ? 1 : 0;
|
||||
if (warmup_flag) {
|
||||
this->interp_enabled = false;
|
||||
int interp_flag = 0;
|
||||
ChemBCast(&interp_flag, 1, MPI_INT);
|
||||
|
||||
// PropagateControlLogic(CHEM_WARMUP_PHASE, 1);
|
||||
// PropagateControlLogic(CHEM_DHT_ENABLE, 0);
|
||||
// PropagateControlLogic(CHEM_IP_ENABLE, 0);
|
||||
} else {
|
||||
this->interp_enabled = true;
|
||||
int interp_flag = 1;
|
||||
ChemBCast(&interp_flag, 1, MPI_INT);
|
||||
|
||||
// PropagateControlLogic(CHEM_WARMUP_PHASE, 0);
|
||||
// PropagateControlLogic(CHEM_DHT_ENABLE, 1);
|
||||
// PropagateControlLogic(CHEM_IP_ENABLE, 1);
|
||||
}
|
||||
|
||||
int control_flag = this->control_module->GetControlIntervalEnabled() ? 1 : 0;
|
||||
if (control_flag) {
|
||||
PropagateControlLogic(CHEM_CTRL_ENABLE, control_flag);
|
||||
}
|
||||
*/
|
||||
|
||||
ftype = CHEM_WORK_LOOP;
|
||||
PropagateFunctionType(ftype);
|
||||
|
||||
@ -509,9 +485,14 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
|
||||
shuffleField(chem_field.AsVector(), this->n_cells, this->prop_count,
|
||||
wp_sizes_vector.size());
|
||||
|
||||
control_enabled = this->control_module->GetControlIntervalEnabled() ? 1 : 0;
|
||||
control_enabled = this->control_module->getControlIntervalEnabled() ? 1 : 0;
|
||||
std::vector<double> mpi_surr_buffer{mpi_buffer};
|
||||
|
||||
std::cout << "control_enabled is " << control_enabled << ", "
|
||||
<< "warmup_enabled is " << warmup_enabled << ", "
|
||||
<< "dht_enabled is " << dht_enabled << ", "
|
||||
<< "interp_enabled is " << interp_enabled << std::endl;
|
||||
|
||||
/* setup local variables */
|
||||
pkg_to_send = wp_sizes_vector.size();
|
||||
pkg_to_recv = wp_sizes_vector.size();
|
||||
@ -569,45 +550,24 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
|
||||
std::cout << "[Master] Control logic enabled for this iteration."
|
||||
<< std::endl;
|
||||
std::vector<double> sur_unshuffled{mpi_surr_buffer};
|
||||
|
||||
shuf_a = MPI_Wtime();
|
||||
unshuffleField(mpi_surr_buffer, this->n_cells, this->prop_count,
|
||||
wp_sizes_vector.size(), sur_unshuffled);
|
||||
shuf_b = MPI_Wtime();
|
||||
this->shuf_t += shuf_b - shuf_a;
|
||||
|
||||
// Quick debug: compare out_vec vs sur_unshuffled
|
||||
size_t N = out_vec.size();
|
||||
if (N != sur_unshuffled.size()) {
|
||||
std::cerr << "[MASTER DBG] size mismatch out_vec=" << N
|
||||
<< " sur_unshuffled=" << sur_unshuffled.size() << std::endl;
|
||||
} /*else {
|
||||
double max_abs = 0.0;
|
||||
double max_rel = 0.0;
|
||||
size_t worst_i = 0;
|
||||
for (size_t i = 0; i < N; i) {
|
||||
double a = out_vec[i];
|
||||
double b = sur_unshuffled[i];
|
||||
double absd = std::fabs(a - b);
|
||||
if (absd > max_abs) {
|
||||
max_abs = absd;
|
||||
worst_i = i;
|
||||
}
|
||||
double rel = (std::fabs(a) > 1e-12) ? absd / std::fabs(a) : (absd > 0 ?
|
||||
1e12 : 0.0); if (rel > max_rel) max_rel = rel;
|
||||
}
|
||||
std::cerr << "[MASTER DBG] control compare N=" << N
|
||||
<< " max_abs=" << max_abs << " max_rel=" << max_rel
|
||||
<< " worst_idx=" << worst_i
|
||||
<< " out_vec[worst]=" << out_vec[worst_i]
|
||||
<< " sur[worst]=" << sur_unshuffled[worst_i] << std::endl;
|
||||
// optionally print first 8 entries
|
||||
std::cerr << "[MASTER DBG] out[0..7]: ";
|
||||
for (size_t i = 0; i < std::min<size_t>(8, N); i) std::cerr << out_vec[i]
|
||||
<< " "; std::cerr << "\n[MASTER DBG] sur[0..7]: "; for (size_t i = 0; i <
|
||||
std::min<size_t>(8, N); +i) std::cerr << sur_unshuffled[i] << " "; std::cerr
|
||||
<< std::endl;
|
||||
}
|
||||
*/
|
||||
|
||||
control_module->ComputeSpeciesErrorMetrics(out_vec, sur_unshuffled,
|
||||
this->n_cells);
|
||||
metrics_a = MPI_Wtime();
|
||||
control_module->computeSpeciesErrorMetrics(out_vec, sur_unshuffled,
|
||||
this->n_cells);
|
||||
metrics_b = MPI_Wtime();
|
||||
this->metrics_t += metrics_b - metrics_a;
|
||||
}
|
||||
|
||||
/* start time measurement of master chemistry */
|
||||
|
||||
@ -155,6 +155,7 @@ void poet::ChemistryModule::WorkerDoWork(MPI_Status &probe_status,
|
||||
double dht_get_start, dht_get_end;
|
||||
double phreeqc_time_start, phreeqc_time_end;
|
||||
double dht_fill_start, dht_fill_end;
|
||||
double ctrl_cp_start, ctrl_cp_end, ctrl_start, ctrl_end;
|
||||
|
||||
uint32_t iteration;
|
||||
double dt;
|
||||
@ -239,11 +240,14 @@ void poet::ChemistryModule::WorkerDoWork(MPI_Status &probe_status,
|
||||
poet::WorkPackage s_curr_wp_control = s_curr_wp;
|
||||
|
||||
if (control_enabled) {
|
||||
ctrl_cp_start = MPI_Wtime();
|
||||
for (std::size_t wp_i = 0; wp_i < s_curr_wp_control.size; wp_i++) {
|
||||
s_curr_wp_control.output[wp_i] =
|
||||
std::vector<double>(this->prop_count, 0.0);
|
||||
s_curr_wp_control.mapping[wp_i] = CHEM_PQC;
|
||||
}
|
||||
ctrl_cp_end = MPI_Wtime();
|
||||
timings.ctrl_t += ctrl_cp_end - ctrl_cp_start;
|
||||
}
|
||||
|
||||
phreeqc_time_start = MPI_Wtime();
|
||||
@ -254,7 +258,7 @@ void poet::ChemistryModule::WorkerDoWork(MPI_Status &probe_status,
|
||||
phreeqc_time_end = MPI_Wtime();
|
||||
|
||||
if (control_enabled) {
|
||||
|
||||
ctrl_start = MPI_Wtime();
|
||||
std::size_t sur_wp_offset = s_curr_wp.size * this->prop_count;
|
||||
|
||||
mpi_buffer.resize(count + sur_wp_offset);
|
||||
@ -281,7 +285,8 @@ void poet::ChemistryModule::WorkerDoWork(MPI_Status &probe_status,
|
||||
}
|
||||
}
|
||||
count += sur_wp_offset;
|
||||
|
||||
ctrl_end = MPI_Wtime();
|
||||
timings.ctrl_t += ctrl_end - ctrl_start;
|
||||
} else {
|
||||
for (std::size_t wp_i = 0; wp_i < s_curr_wp.size; wp_i++) {
|
||||
std::copy(s_curr_wp.output[wp_i].begin(), s_curr_wp.output[wp_i].end(),
|
||||
@ -302,18 +307,8 @@ void poet::ChemistryModule::WorkerDoWork(MPI_Status &probe_status,
|
||||
dht->fillDHT(control_enabled ? s_curr_wp_control : s_curr_wp);
|
||||
dht_fill_end = MPI_Wtime();
|
||||
|
||||
int filled_count = std::count(dht->getDHTResults().filledDHT.begin(),
|
||||
dht->getDHTResults().filledDHT.end(), true);
|
||||
|
||||
std::cout << "[Worker " << std::to_string(this->comm_rank)
|
||||
<< "] DHT filled entries=" << std::to_string(filled_count)
|
||||
<< std::endl;
|
||||
|
||||
if (interp_enabled || warmup_enabled) {
|
||||
interp->writePairs();
|
||||
std::cout << "[Worker " << std::to_string(this->comm_rank) << "] "
|
||||
<< "Writing pairs to PHT after iteration "
|
||||
<< std::to_string(iteration) << std::endl;
|
||||
}
|
||||
timings.dht_fill += dht_fill_end - dht_fill_start;
|
||||
}
|
||||
|
||||
@ -4,31 +4,21 @@
|
||||
#include "IO/StatsIO.hpp"
|
||||
#include <cmath>
|
||||
|
||||
void poet::ControlModule::UpdateControlIteration(const uint32_t &iter,
|
||||
void poet::ControlModule::updateControlIteration(const uint32_t &iter,
|
||||
const bool &dht_enabled,
|
||||
const bool &interp_enabled) {
|
||||
|
||||
/* dht_enabled and inter_enabled are user settings set before startig the
|
||||
* simulation*/
|
||||
double prep_a, prep_b;
|
||||
|
||||
prep_a = MPI_Wtime();
|
||||
if (control_interval == 0) {
|
||||
control_interval_enabled = false;
|
||||
return;
|
||||
}
|
||||
// InitiateWarmupPhase(dht_enabled, interp_enabled);
|
||||
global_iteration = iter;
|
||||
|
||||
if (global_iteration <= control_interval) {
|
||||
chem->SetWarmupEnabled(true);
|
||||
chem->SetDhtEnabled(false);
|
||||
chem->SetInterpEnabled(false);
|
||||
MSG("Warmup enabled until first control interval at iteration " +
|
||||
std::to_string(control_interval) + ".");
|
||||
} else {
|
||||
chem->SetWarmupEnabled(false);
|
||||
chem->SetDhtEnabled(true);
|
||||
chem->SetInterpEnabled(true);
|
||||
}
|
||||
initiateWarmupPhase(dht_enabled, interp_enabled);
|
||||
|
||||
control_interval_enabled =
|
||||
(control_interval > 0 && iter % control_interval == 0);
|
||||
@ -37,92 +27,82 @@ void poet::ControlModule::UpdateControlIteration(const uint32_t &iter,
|
||||
MSG("[Control] Control interval enabled at iteration " +
|
||||
std::to_string(iter));
|
||||
}
|
||||
prep_b = MPI_Wtime();
|
||||
this->prep_t += prep_b - prep_a;
|
||||
}
|
||||
|
||||
void poet::ControlModule::InitiateWarmupPhase(bool dht_enabled,
|
||||
void poet::ControlModule::initiateWarmupPhase(bool dht_enabled,
|
||||
bool interp_enabled) {
|
||||
|
||||
// user requested DHT/INTEP? keep them disabled but enable warmup-phase so
|
||||
// workers do prepareKeys/fillDHT/writePairs as required.
|
||||
if (global_iteration < control_interval) {
|
||||
/* warmup phase: keep dht and interp disabled,
|
||||
workers do prepareKeys/fillDHT/writePairs*/
|
||||
if (global_iteration <= control_interval || rollback_enabled) {
|
||||
chem->SetWarmupEnabled(true);
|
||||
// chem->SetDhtEnabled(false);
|
||||
// chem->SetInterpEnabled(false);
|
||||
MSG("Warmup enabled until first control interval at iteration " +
|
||||
chem->SetDhtEnabled(false);
|
||||
chem->SetInterpEnabled(false);
|
||||
MSG("Warmup enabled until next control interval at iteration " +
|
||||
std::to_string(control_interval) + ".");
|
||||
} else {
|
||||
/* after warmup phase: restore according to user's request*/
|
||||
chem->SetWarmupEnabled(false);
|
||||
// chem->SetDhtEnabled(dht_enabled);
|
||||
// chem->SetInterpEnabled(interp_enabled);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
void poet::ControlModule::beginIteration() {
|
||||
if (rollback_enabled) {
|
||||
if (sur_disabled_counter > 0) {
|
||||
sur_disabled_counter--;
|
||||
MSG("Rollback counter: " + std::to_string(sur_disabled_counter));
|
||||
} else {
|
||||
rollback_enabled = false;
|
||||
if (rollback_enabled) {
|
||||
if (sur_disabled_counter > 0) {
|
||||
--sur_disabled_counter;
|
||||
MSG("Rollback counter: " + std::to_string(sur_disabled_counter));
|
||||
} else {
|
||||
rollback_enabled = false;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
chem->SetWarmupEnabled(false);
|
||||
chem->SetDhtEnabled(dht_enabled);
|
||||
chem->SetInterpEnabled(interp_enabled);
|
||||
}
|
||||
*/
|
||||
|
||||
void poet::ControlModule::EndIteration(const uint32_t iter) {
|
||||
|
||||
void poet::ControlModule::applyControlLogic(ChemistryModule &chem,
|
||||
uint32_t &iter) {
|
||||
if (!control_interval_enabled) {
|
||||
return;
|
||||
}
|
||||
/* Writing a checkpointing */
|
||||
/* Control Logic*/
|
||||
if (!chem) {
|
||||
MSG("chem pointer is null — skipping checkpoint/stats write");
|
||||
} else {
|
||||
MSG("Writing checkpoint of iteration " + std::to_string(iter));
|
||||
write_checkpoint(out_dir, "checkpoint" + std::to_string(iter) + ".hdf5",
|
||||
{.field = chem->getField(), .iteration = iter});
|
||||
writeStatsToCSV(error_history, species_names, out_dir, "stats_overview");
|
||||
writeCheckpointAndMetrics(chem, iter);
|
||||
|
||||
// if()
|
||||
/*
|
||||
|
||||
if (triggerRollbackIfExceeded(*chem, *params, iter)) {
|
||||
rollback_enabled = true;
|
||||
rollback_counter++;
|
||||
sur_disabled_counter = control_interval;
|
||||
MSG("Interpolation disabled for the next " +
|
||||
std::to_string(control_interval) + ".");
|
||||
}
|
||||
|
||||
*/
|
||||
if (checkAndRollback(chem, iter) && rollback_count < 4) {
|
||||
rollback_enabled = true;
|
||||
rollback_count++;
|
||||
sur_disabled_counter = control_interval;
|
||||
MSG("Interpolation disabled for the next " +
|
||||
std::to_string(control_interval) + ".");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
void poet::ControlModule::BCastControlFlags() {
|
||||
int interp_flag = rollback_enabled ? 0 : 1;
|
||||
int dht_fill_flag = rollback_enabled ? 1 : 0;
|
||||
chem->ChemBCast(&interp_flag, 1, MPI_INT);
|
||||
chem->ChemBCast(&dht_fill_flag, 1, MPI_INT);
|
||||
void poet::ControlModule::writeCheckpointAndMetrics(ChemistryModule &chem,
|
||||
uint32_t iter) {
|
||||
|
||||
double w_check_a, w_check_b, stats_a, stats_b;
|
||||
MSG("Writing checkpoint of iteration " + std::to_string(iter));
|
||||
|
||||
w_check_a = MPI_Wtime();
|
||||
write_checkpoint(out_dir, "checkpoint" + std::to_string(iter) + ".hdf5",
|
||||
{.field = chem.getField(), .iteration = iter});
|
||||
w_check_b = MPI_Wtime();
|
||||
this->w_check_t += w_check_b - w_check_a;
|
||||
|
||||
stats_a = MPI_Wtime();
|
||||
writeStatsToCSV(metricsHistory, species_names, out_dir, "stats_overview");
|
||||
stats_b = MPI_Wtime();
|
||||
|
||||
this->stats_t += stats_b - stats_a;
|
||||
}
|
||||
|
||||
*/
|
||||
bool poet::ControlModule::checkAndRollback(ChemistryModule &chem,
|
||||
uint32_t &iter) {
|
||||
double r_check_a, r_check_b;
|
||||
|
||||
|
||||
bool poet::ControlModule::RollbackIfThresholdExceeded(ChemistryModule &chem) {
|
||||
|
||||
/**
|
||||
if (error_history.empty()) {
|
||||
if (metricsHistory.empty()) {
|
||||
MSG("No error history yet; skipping rollback check.");
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto &mape = error_history.back().mape;
|
||||
const auto &mape = metricsHistory.back().mape;
|
||||
|
||||
for (uint32_t i = 0; i < species_names.size(); ++i) {
|
||||
if (mape[i] == 0) {
|
||||
@ -130,34 +110,36 @@ bool poet::ControlModule::RollbackIfThresholdExceeded(ChemistryModule &chem) {
|
||||
}
|
||||
|
||||
if (mape[i] > mape_threshold[i]) {
|
||||
uint32_t rollback_iter = ((global_iteration - 1) / checkpoint_interval) *
|
||||
checkpoint_interval;
|
||||
uint32_t rollback_iter =
|
||||
((iter - 1) / checkpoint_interval) * checkpoint_interval;
|
||||
|
||||
MSG("[THRESHOLD EXCEEDED] " + species_names[i] +
|
||||
" has MAPE = " + std::to_string(mape[i]) +
|
||||
" exceeding threshold = " + std::to_string(mape_threshold[i])
|
||||
+ " → rolling back to iteration " + std::to_string(rollback_iter));
|
||||
" exceeding threshold = " + std::to_string(mape_threshold[i]) +
|
||||
" → rolling back to iteration " + std::to_string(rollback_iter));
|
||||
|
||||
r_check_a = MPI_Wtime();
|
||||
Checkpoint_s checkpoint_read{.field = chem.getField()};
|
||||
read_checkpoint(out_dir,
|
||||
"checkpoint" + std::to_string(rollback_iter) + ".hdf5",
|
||||
checkpoint_read);
|
||||
global_iteration = checkpoint_read.iteration;
|
||||
iter = checkpoint_read.iteration;
|
||||
r_check_b = MPI_Wtime();
|
||||
r_check_t += r_check_b - r_check_a;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
MSG("All species are within their MAPE thresholds.");
|
||||
|
||||
return false;
|
||||
*/
|
||||
}
|
||||
|
||||
void poet::ControlModule::ComputeSpeciesErrorMetrics(
|
||||
void poet::ControlModule::computeSpeciesErrorMetrics(
|
||||
const std::vector<double> &reference_values,
|
||||
const std::vector<double> &surrogate_values, const uint32_t size_per_prop) {
|
||||
|
||||
SimulationErrorStats species_error_stats(this->species_names.size(),
|
||||
global_iteration,
|
||||
/*rollback_counter*/ 0);
|
||||
SpeciesErrorMetrics metrics(this->species_names.size(), global_iteration,
|
||||
rollback_count);
|
||||
|
||||
if (reference_values.size() != surrogate_values.size()) {
|
||||
MSG(" Reference and surrogate vectors differ in size: " +
|
||||
@ -179,8 +161,8 @@ void poet::ControlModule::ComputeSpeciesErrorMetrics(
|
||||
double err_sum = 0.0;
|
||||
double sqr_err_sum = 0.0;
|
||||
uint32_t base_idx = i * size_per_prop;
|
||||
uint32_t nan_count = 0;
|
||||
uint32_t valid_count = 0;
|
||||
|
||||
int count = 0;
|
||||
|
||||
for (uint32_t j = 0; j < size_per_prop; ++j) {
|
||||
const double ref_value = reference_values[base_idx + j];
|
||||
@ -188,10 +170,8 @@ void poet::ControlModule::ComputeSpeciesErrorMetrics(
|
||||
const double ZERO_ABS = 1e-13;
|
||||
|
||||
if (std::isnan(ref_value) || std::isnan(sur_value)) {
|
||||
nan_count++;
|
||||
continue;
|
||||
}
|
||||
valid_count++;
|
||||
|
||||
if (std::abs(ref_value) < ZERO_ABS) {
|
||||
if (std::abs(sur_value) >= ZERO_ABS) {
|
||||
@ -206,15 +186,8 @@ void poet::ControlModule::ComputeSpeciesErrorMetrics(
|
||||
sqr_err_sum += alpha * alpha;
|
||||
}
|
||||
}
|
||||
if (valid_count > 0) {
|
||||
species_error_stats.mape[i] = 100.0 * (err_sum / valid_count);
|
||||
species_error_stats.rrmse[i] = std::sqrt(sqr_err_sum / valid_count);
|
||||
} else {
|
||||
species_error_stats.mape[i] = 0.0;
|
||||
species_error_stats.rrmse[i] = 0.0;
|
||||
std::cerr << "[CTRL WARN] no valid samples for species " << i << " ("
|
||||
<< this->species_names[i] << "), setting errors to 0\n";
|
||||
}
|
||||
metrics.mape[i] = 100.0 * (err_sum / size_per_prop);
|
||||
metrics.rrmse[i] = std::sqrt(sqr_err_sum / size_per_prop);
|
||||
}
|
||||
error_history.push_back(species_error_stats);
|
||||
metricsHistory.push_back(metrics);
|
||||
}
|
||||
|
||||
@ -22,36 +22,29 @@ public:
|
||||
// std::uint32_t sur_disabled_counter = 0;
|
||||
// std::uint32_t rollback_counter = 0;
|
||||
|
||||
void UpdateControlIteration(const uint32_t &iter, const bool &dht_enabled,
|
||||
void updateControlIteration(const uint32_t &iter, const bool &dht_enabled,
|
||||
const bool &interp_enaled);
|
||||
|
||||
void InitiateWarmupPhase(bool dht_enabled, bool interp_enabled);
|
||||
void initiateWarmupPhase(bool dht_enabled, bool interp_enabled);
|
||||
|
||||
auto GetGlobalIteration() const noexcept { return global_iteration; }
|
||||
bool checkAndRollback(ChemistryModule &chem, uint32_t &iter);
|
||||
|
||||
// void beginIteration();
|
||||
|
||||
// void BCastControlFlags();
|
||||
|
||||
bool RollbackIfThresholdExceeded(ChemistryModule &chem);
|
||||
|
||||
struct SimulationErrorStats {
|
||||
struct SpeciesErrorMetrics {
|
||||
std::vector<double> mape;
|
||||
std::vector<double> rrmse;
|
||||
uint32_t iteration; // iterations in simulation after rollbacks
|
||||
uint32_t rollback_count;
|
||||
|
||||
SimulationErrorStats(uint32_t species_count, uint32_t iter,
|
||||
uint32_t counter)
|
||||
SpeciesErrorMetrics(uint32_t species_count, uint32_t iter, uint32_t counter)
|
||||
: mape(species_count, 0.0), rrmse(species_count, 0.0), iteration(iter),
|
||||
rollback_count(counter) {}
|
||||
};
|
||||
|
||||
void ComputeSpeciesErrorMetrics(const std::vector<double> &reference_values,
|
||||
const std::vector<double> &surrogate_values,
|
||||
const uint32_t size_per_prop);
|
||||
void computeSpeciesErrorMetrics(const std::vector<double> &reference_values,
|
||||
const std::vector<double> &surrogate_values,
|
||||
const uint32_t size_per_prop);
|
||||
|
||||
std::vector<SimulationErrorStats> error_history;
|
||||
std::vector<SpeciesErrorMetrics> metricsHistory;
|
||||
|
||||
struct ControlSetup {
|
||||
std::string out_dir;
|
||||
@ -61,7 +54,7 @@ public:
|
||||
std::vector<double> mape_threshold;
|
||||
};
|
||||
|
||||
void EnableControlLogic(const ControlSetup &setup) {
|
||||
void enableControlLogic(const ControlSetup &setup) {
|
||||
this->out_dir = setup.out_dir;
|
||||
this->checkpoint_interval = setup.checkpoint_interval;
|
||||
this->control_interval = setup.control_interval;
|
||||
@ -69,24 +62,31 @@ public:
|
||||
this->mape_threshold = setup.mape_threshold;
|
||||
}
|
||||
|
||||
bool GetControlIntervalEnabled() const {
|
||||
bool getControlIntervalEnabled() const {
|
||||
return this->control_interval_enabled;
|
||||
}
|
||||
|
||||
void EndIteration(const uint32_t iter);
|
||||
void applyControlLogic(ChemistryModule &chem, uint32_t &iter);
|
||||
|
||||
void SetChemistryModule(poet::ChemistryModule *c) { chem = c; }
|
||||
void writeCheckpointAndMetrics(ChemistryModule &chem, uint32_t iter);
|
||||
|
||||
auto GetControlInterval() const { return this->control_interval; }
|
||||
auto getGlobalIteration() const noexcept { return global_iteration; }
|
||||
|
||||
std::vector<double> GetMapeThreshold() const { return this->mape_threshold; }
|
||||
void setChemistryModule(poet::ChemistryModule *c) { chem = c; }
|
||||
|
||||
auto getControlInterval() const { return this->control_interval; }
|
||||
|
||||
std::vector<double> getMapeThreshold() const { return this->mape_threshold; }
|
||||
|
||||
/* Profiling getters */
|
||||
auto GetMasterCtrlLogicTime() const { return this->ctrl_time; }
|
||||
|
||||
auto GetMasterCtrlBcastTime() const { return this->bcast_ctrl_time; }
|
||||
auto getUpdateCtrlLogicTime() const { return this->prep_t; }
|
||||
|
||||
auto GetMasterRecvCtrlLogicTime() const { return this->recv_ctrl_time; }
|
||||
auto getWriteCheckpointTime() const { return this->w_check_t; }
|
||||
|
||||
auto getReadCheckpointTime() const { return this->r_check_t; }
|
||||
|
||||
auto getWriteMetricsTime() const { return this->stats_t; }
|
||||
|
||||
private:
|
||||
bool rollback_enabled = false;
|
||||
@ -97,14 +97,17 @@ private:
|
||||
std::uint32_t checkpoint_interval = 0;
|
||||
std::uint32_t control_interval = 0;
|
||||
std::uint32_t global_iteration = 0;
|
||||
std::uint32_t rollback_count = 0;
|
||||
std::uint32_t sur_disabled_counter = 0;
|
||||
std::vector<double> mape_threshold;
|
||||
|
||||
std::vector<std::string> species_names;
|
||||
std::string out_dir;
|
||||
|
||||
double ctrl_time = 0.0;
|
||||
double bcast_ctrl_time = 0.0;
|
||||
double recv_ctrl_time = 0.0;
|
||||
double prep_t = 0.;
|
||||
double r_check_t = 0.;
|
||||
double w_check_t = 0.;
|
||||
double stats_t = 0.;
|
||||
|
||||
/* Buffer for shuffled surrogate data */
|
||||
std::vector<double> sur_shuffled;
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
|
||||
namespace poet
|
||||
{
|
||||
void writeStatsToCSV(const std::vector<ControlModule::SimulationErrorStats> &all_stats,
|
||||
void writeStatsToCSV(const std::vector<ControlModule::SpeciesErrorMetrics> &all_stats,
|
||||
const std::vector<std::string> &species_names,
|
||||
const std::string &out_dir,
|
||||
const std::string &filename)
|
||||
@ -47,7 +47,7 @@ namespace poet
|
||||
}
|
||||
|
||||
out.close();
|
||||
std::cout << "Stats written to " << filename << "\n";
|
||||
std::cout << "Error metrics written to " << out_dir << "/" << filename << "\n";
|
||||
}
|
||||
}
|
||||
// namespace poet
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
namespace poet
|
||||
{
|
||||
void writeStatsToCSV(const std::vector<ControlModule::SimulationErrorStats> &all_stats,
|
||||
void writeStatsToCSV(const std::vector<ControlModule::SpeciesErrorMetrics> &all_stats,
|
||||
const std::vector<std::string> &species_names,
|
||||
const std::string &out_dir,
|
||||
const std::string &filename);
|
||||
|
||||
69
src/poet.cpp
69
src/poet.cpp
@ -300,23 +300,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters ¶ms,
|
||||
double dSimTime{0};
|
||||
|
||||
for (uint32_t iter = 1; iter < maxiter + 1; iter++) {
|
||||
// Rollback countdowm
|
||||
|
||||
/*
|
||||
if (params.rollback_enabled) {
|
||||
if (params.sur_disabled_counter > 0) {
|
||||
--params.sur_disabled_counter;
|
||||
MSG("Rollback counter: " + std::to_string(params.sur_disabled_counter));
|
||||
} else {
|
||||
params.rollback_enabled = false;
|
||||
}
|
||||
}
|
||||
*/
|
||||
//control.beginIteration(iter);
|
||||
|
||||
// params.global_iter = iter;
|
||||
control.UpdateControlIteration(iter, params.use_dht, params.use_interp);
|
||||
// params.control_interval_enabled = (iter % params.control_interval == 0);
|
||||
control.updateControlIteration(iter, params.use_dht, params.use_interp);
|
||||
|
||||
double start_t = MPI_Wtime();
|
||||
|
||||
@ -333,8 +317,6 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters ¶ms,
|
||||
/* run transport */
|
||||
diffusion.simulate(dt);
|
||||
|
||||
// chem.runtime_params = ¶ms;
|
||||
|
||||
chem.getField().update(diffusion.getField());
|
||||
|
||||
// MSG("Chemistry start");
|
||||
@ -428,33 +410,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters ¶ms,
|
||||
MSG("End of *coupling* iteration " + std::to_string(iter) + "/" +
|
||||
std::to_string(maxiter));
|
||||
|
||||
control.EndIteration(iter);
|
||||
/*
|
||||
if (iter % params.checkpoint_interval == 0) {
|
||||
MSG("Writing checkpoint of iteration " + std::to_string(iter));
|
||||
write_checkpoint(params.out_dir,
|
||||
"checkpoint" + std::to_string(iter) + ".hdf5",
|
||||
{.field = chem.getField(), .iteration = iter});
|
||||
}
|
||||
|
||||
|
||||
if (params.control_interval_enabled && !params.rollback_enabled) {
|
||||
writeStatsToCSV(chem.error_history, chem.getField().GetProps(),
|
||||
params.out_dir, "stats_overview");
|
||||
|
||||
if (triggerRollbackIfExceeded(chem, params, iter)) {
|
||||
params.rollback_enabled = true;
|
||||
params.rollback_counter++;
|
||||
params.sur_disabled_counter = params.control_interval;
|
||||
MSG("Interpolation disabled for the next " +
|
||||
std::to_string(params.control_interval) + ".");
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
|
||||
|
||||
control.applyControlLogic(chem, iter);
|
||||
// MSG();
|
||||
} // END SIMULATION LOOP
|
||||
|
||||
@ -471,14 +427,17 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters ¶ms,
|
||||
Rcpp::List diffusion_profiling;
|
||||
diffusion_profiling["simtime"] = diffusion.getTransportTime();
|
||||
|
||||
/*Rcpp::List ctrl_profiling;
|
||||
ctrl_profiling["checkpointing_time"] = chkTime;
|
||||
ctrl_profiling["ctrl_logic_master"] = chem.GetMasterCtrlLogicTime();
|
||||
ctrl_profiling["bcast_ctrl_logic_master"] = chem.GetMasterCtrlBcastTime();
|
||||
ctrl_profiling["recv_ctrl_logic_maser"] = chem.GetMasterRecvCtrlLogicTime();
|
||||
ctrl_profiling["ctrl_logic_worker"] =
|
||||
Rcpp::List ctrl_profiling;
|
||||
ctrl_profiling["compute_metrics_master"] = chem.GetMasterCtrlMetricsTime();
|
||||
ctrl_profiling["unshuffle_field_master"] = chem.GetMasterUnshuffleTime();
|
||||
ctrl_profiling["w_checkpoint_master"] = control.getWriteCheckpointTime();
|
||||
ctrl_profiling["r_checkpoint_master"] = control.getReadCheckpointTime();
|
||||
ctrl_profiling["write_stats"] = control.getWriteMetricsTime();
|
||||
ctrl_profiling["ctrl_logic_master"] = control.getUpdateCtrlLogicTime();
|
||||
ctrl_profiling["recv_data_master"] = chem.GetMasterRecvCtrlDataTime();
|
||||
ctrl_profiling["worker"] =
|
||||
Rcpp::wrap(chem.GetWorkerControlTimings());
|
||||
*/
|
||||
|
||||
|
||||
if (params.use_dht) {
|
||||
chem_profiling["dht_hits"] = Rcpp::wrap(chem.GetWorkerDHTHits());
|
||||
@ -651,7 +610,7 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
ControlModule control;
|
||||
chemistry.SetControlModule(&control);
|
||||
control.SetChemistryModule(&chemistry);
|
||||
control.setChemistryModule(&chemistry);
|
||||
|
||||
const ChemistryModule::SurrogateSetup surr_setup = {
|
||||
getSpeciesNames(init_list.getInitialGrid(), 0, MPI_COMM_WORLD),
|
||||
@ -676,7 +635,7 @@ int main(int argc, char *argv[]) {
|
||||
getSpeciesNames(init_list.getInitialGrid(), 0, MPI_COMM_WORLD),
|
||||
run_params.mape_threshold};
|
||||
|
||||
control.EnableControlLogic(ctrl_setup);
|
||||
control.enableControlLogic(ctrl_setup);
|
||||
|
||||
if (MY_RANK > 0) {
|
||||
chemistry.WorkerLoop();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user