Wp data not being shuffled correctly.

This commit is contained in:
rastogi 2025-10-23 23:16:58 +02:00 committed by Max Lübke
parent 71269166ea
commit dc940b2f88
7 changed files with 632 additions and 708 deletions

View File

@ -2,15 +2,16 @@
#ifndef CHEMISTRYMODULE_H_ #ifndef CHEMISTRYMODULE_H_
#define CHEMISTRYMODULE_H_ #define CHEMISTRYMODULE_H_
#include "ChemistryDefs.hpp"
#include "DataStructures/Field.hpp" #include "DataStructures/Field.hpp"
#include "DataStructures/NamedVector.hpp" #include "DataStructures/NamedVector.hpp"
#include "ChemistryDefs.hpp" #include "ChemistryDefs.hpp"
#include "Control/ControlModule.hpp" #include "Control/ControlModule.hpp"
#include "Init/InitialList.hpp" #include "Init/InitialList.hpp"
#include "NameDouble.h" #include "NameDouble.h"
#include "PhreeqcRunner.hpp"
#include "SurrogateModels/DHT_Wrapper.hpp" #include "SurrogateModels/DHT_Wrapper.hpp"
#include "SurrogateModels/Interpolation.hpp" #include "SurrogateModels/Interpolation.hpp"
#include "PhreeqcRunner.hpp"
#include <array> #include <array>
#include <cstdint> #include <cstdint>
@ -174,12 +175,6 @@ public:
*/ */
auto GetMasterLoopTime() const { return this->send_recv_t; } auto GetMasterLoopTime() const { return this->send_recv_t; }
auto GetMasterCtrlLogicTime() const { return this->ctrl_t; }
auto GetMasterCtrlBcastTime() const { return this->bcast_ctrl_t; }
auto GetMasterRecvCtrlLogicTime() const { return this->recv_ctrl_t; }
/** /**
* **Master only** Collect and return all accumulated timings recorded by * **Master only** Collect and return all accumulated timings recorded by
* workers to run Phreeqc simulation. * workers to run Phreeqc simulation.
@ -257,6 +252,8 @@ public:
std::vector<int> ai_surrogate_validity_vector; std::vector<int> ai_surrogate_validity_vector;
void setControlModule(poet::ControlModule *ctrl) { control_module = ctrl; }
protected: protected:
void initializeDHT(uint32_t size_mb, void initializeDHT(uint32_t size_mb,
const NamedVector<std::uint32_t> &key_species, const NamedVector<std::uint32_t> &key_species,
@ -274,7 +271,8 @@ protected:
CHEM_DHT_SIGNIF_VEC, CHEM_DHT_SIGNIF_VEC,
CHEM_DHT_SNAPS, CHEM_DHT_SNAPS,
CHEM_DHT_READ_FILE, CHEM_DHT_READ_FILE,
CHEM_IP, // Control Flag //CHEM_IP, // Control flag
CHEM_CTRL, // Control flag
CHEM_IP_ENABLE, CHEM_IP_ENABLE,
CHEM_IP_MIN_ENTRIES, CHEM_IP_MIN_ENTRIES,
CHEM_IP_SIGNIF_VEC, CHEM_IP_SIGNIF_VEC,
@ -329,7 +327,7 @@ protected:
void MasterSendPkgs(worker_list_t &w_list, workpointer_t &work_pointer, void MasterSendPkgs(worker_list_t &w_list, workpointer_t &work_pointer,
workpointer_t &sur_pointer, int &pkg_to_send, workpointer_t &sur_pointer, int &pkg_to_send,
int &count_pkgs, int &free_workers, double dt, int &count_pkgs, int &free_workers, double dt,
uint32_t iteration, uint32_t control_iteration, uint32_t iteration,
const std::vector<uint32_t> &wp_sizes_vector); const std::vector<uint32_t> &wp_sizes_vector);
void MasterRecvPkgs(worker_list_t &w_list, int &pkg_to_recv, bool to_send, void MasterRecvPkgs(worker_list_t &w_list, int &pkg_to_recv, bool to_send,
int &free_workers); int &free_workers);
@ -367,6 +365,10 @@ protected:
void BCastStringVec(std::vector<std::string> &io); void BCastStringVec(std::vector<std::string> &io);
int packResultsIntoBuffer(std::vector<double> &mpi_buffer, int base_count,
const WorkPackage &wp,
const WorkPackage &wp_control);
int comm_size, comm_rank; int comm_size, comm_rank;
MPI_Comm group_comm; MPI_Comm group_comm;
@ -380,13 +382,12 @@ protected:
poet::DHT_Wrapper *dht = nullptr; poet::DHT_Wrapper *dht = nullptr;
bool dht_fill_during_rollback{false};
bool interp_enabled{false}; bool interp_enabled{false};
std::unique_ptr<poet::InterpolationModule> interp; std::unique_ptr<poet::InterpolationModule> interp;
bool ai_surrogate_enabled{false}; bool ai_surrogate_enabled{false};
static constexpr uint32_t BUFFER_OFFSET = 6; static constexpr uint32_t BUFFER_OFFSET = 5;
inline void ChemBCast(void *buf, int count, MPI_Datatype datatype) const { inline void ChemBCast(void *buf, int count, MPI_Datatype datatype) const {
MPI_Bcast(buf, count, datatype, 0, this->group_comm); MPI_Bcast(buf, count, datatype, 0, this->group_comm);
@ -400,10 +401,6 @@ protected:
double seq_t = 0.; double seq_t = 0.;
double send_recv_t = 0.; double send_recv_t = 0.;
double ctrl_t = 0.;
double bcast_ctrl_t = 0.;
double recv_ctrl_t = 0.;
std::array<double, 2> base_totals{0}; std::array<double, 2> base_totals{0};
bool print_progessbar{false}; bool print_progessbar{false};
@ -422,9 +419,11 @@ protected:
std::unique_ptr<PhreeqcRunner> pqc_runner; std::unique_ptr<PhreeqcRunner> pqc_runner;
std::unique_ptr<poet::ControlModule> ctrl_module; poet::ControlModule *control_module = nullptr;
//std::vector<double> sur_shuffled; bool control_enabled{false};
// std::vector<double> sur_shuffled;
}; };
} // namespace poet } // namespace poet

View File

@ -235,7 +235,7 @@ inline void printProgressbar(int count_pkgs, int n_wp, int barWidth = 70) {
inline void poet::ChemistryModule::MasterSendPkgs( inline void poet::ChemistryModule::MasterSendPkgs(
worker_list_t &w_list, workpointer_t &work_pointer, worker_list_t &w_list, workpointer_t &work_pointer,
workpointer_t &sur_pointer, int &pkg_to_send, int &count_pkgs, workpointer_t &sur_pointer, int &pkg_to_send, int &count_pkgs,
int &free_workers, double dt, uint32_t iteration, uint32_t control_interval, int &free_workers, double dt, uint32_t iteration,
const std::vector<uint32_t> &wp_sizes_vector) { const std::vector<uint32_t> &wp_sizes_vector) {
/* declare variables */ /* declare variables */
int local_work_package_size; int local_work_package_size;
@ -276,8 +276,6 @@ inline void poet::ChemistryModule::MasterSendPkgs(
std::accumulate(wp_sizes_vector.begin(), std::accumulate(wp_sizes_vector.begin(),
std::next(wp_sizes_vector.begin(), count_pkgs), 0); std::next(wp_sizes_vector.begin(), count_pkgs), 0);
send_buffer[end_of_wp + 4] = wp_start_index; send_buffer[end_of_wp + 4] = wp_start_index;
// whether this iteration is a control iteration
send_buffer[end_of_wp + 5] = control_interval;
/* ATTENTION Worker p has rank p+1 */ /* ATTENTION Worker p has rank p+1 */
// MPI_Send(send_buffer, end_of_wp + BUFFER_OFFSET, MPI_DOUBLE, p + 1, // MPI_Send(send_buffer, end_of_wp + BUFFER_OFFSET, MPI_DOUBLE, p + 1,
@ -285,6 +283,17 @@ inline void poet::ChemistryModule::MasterSendPkgs(
MPI_Send(send_buffer.data(), send_buffer.size(), MPI_DOUBLE, p + 1, MPI_Send(send_buffer.data(), send_buffer.size(), MPI_DOUBLE, p + 1,
LOOP_WORK, this->group_comm); LOOP_WORK, this->group_comm);
/* ---- DEBUG LOG (Sender side) ---- */
std::cout << "[DEBUG][rank=" << p+1
<< "] sending WP " << (count_pkgs - 1)
<< " to worker rank " << (p + 1)
<< " | len=" << send_buffer.size()
<< " | start index=" << wp_start_index
<< " | second element=" << send_buffer[1]
<< " | pkg size=" << local_work_package_size
<< std::endl;
/* -------------------------------- */
/* Mark that worker has work to do */ /* Mark that worker has work to do */
w_list[p].has_work = 1; w_list[p].has_work = 1;
free_workers--; free_workers--;
@ -301,8 +310,9 @@ inline void poet::ChemistryModule::MasterRecvPkgs(worker_list_t &w_list,
int need_to_receive = 1; int need_to_receive = 1;
double idle_a, idle_b; double idle_a, idle_b;
int p, size; int p, size;
double recv_a, recv_b; std::vector<double> recv_buffer;
recv_buffer.reserve(wp_size * prop_count * 2);
MPI_Status probe_status; MPI_Status probe_status;
// master_recv_a = MPI_Wtime(); // master_recv_a = MPI_Wtime();
/* start to loop as long as there are packages to recv and the need to receive /* start to loop as long as there are packages to recv and the need to receive
@ -320,41 +330,48 @@ inline void poet::ChemistryModule::MasterRecvPkgs(worker_list_t &w_list,
idle_b = MPI_Wtime(); idle_b = MPI_Wtime();
this->idle_t += idle_b - idle_a; this->idle_t += idle_b - idle_a;
} }
if (!need_to_receive) {
continue;
}
/* need_to_receive was set to true above, so there is a message to /* need_to_receive was set to true above, so there is a message to
* receive */ * receive */
if (need_to_receive) { p = probe_status.MPI_SOURCE;
p = probe_status.MPI_SOURCE; bool handled = false;
if (probe_status.MPI_TAG == LOOP_WORK) {
MPI_Get_count(&probe_status, MPI_DOUBLE, &size);
MPI_Recv(w_list[p - 1].send_addr, size, MPI_DOUBLE, p, LOOP_WORK,
this->group_comm, MPI_STATUS_IGNORE);
w_list[p - 1].has_work = 0;
pkg_to_recv -= 1;
free_workers++;
}
if (probe_status.MPI_TAG == LOOP_CTRL) {
recv_a = MPI_Wtime();
MPI_Get_count(&probe_status, MPI_DOUBLE, &size);
// layout of buffer is [phreeqc][surrogate] switch (probe_status.MPI_TAG) {
std::vector<double> recv_buffer(size); case LOOP_WORK: {
MPI_Get_count(&probe_status, MPI_DOUBLE, &size);
MPI_Recv(w_list[p - 1].send_addr, size, MPI_DOUBLE, p, LOOP_WORK,
this->group_comm, MPI_STATUS_IGNORE);
handled = true;
break;
}
case LOOP_CTRL: {
/* layout of buffer is [phreeqc][surrogate] */
MPI_Get_count(&probe_status, MPI_DOUBLE, &size);
recv_buffer.resize(size);
MPI_Recv(recv_buffer.data(), size, MPI_DOUBLE, p, LOOP_CTRL,
this->group_comm, MPI_STATUS_IGNORE);
MPI_Recv(recv_buffer.data(), size, MPI_DOUBLE, p, LOOP_CTRL, int half = size / 2;
this->group_comm, MPI_STATUS_IGNORE); std::copy(recv_buffer.begin(), recv_buffer.begin() + half,
w_list[p - 1].send_addr);
std::copy(recv_buffer.begin(), recv_buffer.begin() + (size / 2), std::copy(recv_buffer.begin() + (size / 2), recv_buffer.begin() + size,
w_list[p - 1].send_addr); w_list[p - 1].surrogate_addr);
std::copy(recv_buffer.begin() + (size / 2), recv_buffer.begin() + size, handled = true;
w_list[p - 1].surrogate_addr); break;
}
w_list[p - 1].has_work = 0; default: {
pkg_to_recv -= 1; throw std::runtime_error("Master received unknown MPI tag: " +
free_workers++; std::to_string(probe_status.MPI_TAG));
recv_b = MPI_Wtime(); }
this->recv_ctrl_t += recv_b - recv_a; }
} if (handled) {
w_list[p - 1].has_work = 0;
pkg_to_recv -= 1;
free_workers++;
} }
} }
} }
@ -408,10 +425,6 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
int i_pkgs; int i_pkgs;
int ftype; int ftype;
double ctrl_a, ctrl_b;
double worker_ctrl_a, worker_ctrl_b;
double ctrl_bcast_a, ctrl_bcast_b;
const std::vector<uint32_t> wp_sizes_vector = const std::vector<uint32_t> wp_sizes_vector =
CalculateWPSizesVector(this->n_cells, this->wp_size); CalculateWPSizesVector(this->n_cells, this->wp_size);
@ -425,15 +438,18 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
MPI_INT); MPI_INT);
} }
/* start time measurement of broadcasting interpolation status */ uint32_t control_flag = control_module->GetControlIntervalEnabled();
ctrl_bcast_a = MPI_Wtime(); if (control_flag) {
ftype = CHEM_CTRL;
PropagateFunctionType(ftype);
ChemBCast(&control_flag, 1, MPI_INT);
}
/*
ftype = CHEM_IP; ftype = CHEM_IP;
PropagateFunctionType(ftype); PropagateFunctionType(ftype);
ctrl_module->BCastControlFlags(); ctrl_module->BCastControlFlags();
/* end time measurement of broadcasting interpolation status */ */
ctrl_bcast_b = MPI_Wtime();
this->bcast_ctrl_t += ctrl_bcast_b - ctrl_bcast_a;
ftype = CHEM_WORK_LOOP; ftype = CHEM_WORK_LOOP;
PropagateFunctionType(ftype); PropagateFunctionType(ftype);
@ -441,32 +457,23 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
static uint32_t iteration = 0; static uint32_t iteration = 0;
uint32_t control_logic_enabled =
ctrl_module->control_interval_enabled ? 1 : 0;
if (control_logic_enabled) {
ctrl_module->sur_shuffled.clear();
ctrl_module->sur_shuffled.reserve(this->n_cells * this->prop_count);
}
/* start time measurement of sequential part */ /* start time measurement of sequential part */
seq_a = MPI_Wtime(); seq_a = MPI_Wtime();
/* shuffle grid */ /* shuffle grid */
// grid.shuffleAndExport(mpi_buffer); // grid.shuffleAndExport(mpi_buffer);
std::vector<double> mpi_buffer = std::vector<double> mpi_buffer =
shuffleField(chem_field.AsVector(), this->n_cells, this->prop_count, shuffleField(chem_field.AsVector(), this->n_cells, this->prop_count,
wp_sizes_vector.size()); wp_sizes_vector.size());
ctrl_module->sur_shuffled.resize(mpi_buffer.size()); std::vector<double> mpi_surr_buffer{mpi_buffer};
/* setup local variables */ /* setup local variables */
pkg_to_send = wp_sizes_vector.size(); pkg_to_send = wp_sizes_vector.size();
pkg_to_recv = wp_sizes_vector.size(); pkg_to_recv = wp_sizes_vector.size();
workpointer_t work_pointer = mpi_buffer.begin(); workpointer_t work_pointer = mpi_buffer.begin();
workpointer_t sur_pointer = ctrl_module->sur_shuffled.begin(); workpointer_t sur_pointer = mpi_surr_buffer.begin();
worker_list_t worker_list(this->comm_size - 1); worker_list_t worker_list(this->comm_size - 1);
free_workers = this->comm_size - 1; free_workers = this->comm_size - 1;
@ -490,8 +497,7 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
if (pkg_to_send > 0) { if (pkg_to_send > 0) {
// send packages to all free workers ... // send packages to all free workers ...
MasterSendPkgs(worker_list, work_pointer, sur_pointer, pkg_to_send, MasterSendPkgs(worker_list, work_pointer, sur_pointer, pkg_to_send,
i_pkgs, free_workers, dt, iteration, control_logic_enabled, i_pkgs, free_workers, dt, iteration, wp_sizes_vector);
wp_sizes_vector);
} }
// ... and try to receive them from workers that have finished their work // ... and try to receive them from workers that have finished their work
MasterRecvPkgs(worker_list, pkg_to_recv, pkg_to_send > 0, free_workers); MasterRecvPkgs(worker_list, pkg_to_recv, pkg_to_send > 0, free_workers);
@ -516,22 +522,17 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
/* do master stuff */ /* do master stuff */
/* start time measurement of control logic */ if (control_flag) {
ctrl_a = MPI_Wtime(); std::cout << "[Master] Control logic enabled for this iteration."
<< std::endl;
if (control_logic_enabled && !ctrl_module->rollback_enabled) { std::vector<double> sur_unshuffled{mpi_surr_buffer};
std::cout << "[Master] Control logic enabled for this iteration." << std::endl; unshuffleField(mpi_surr_buffer, this->n_cells, this->prop_count,
std::vector<double> sur_unshuffled{ctrl_module->sur_shuffled};
unshuffleField(ctrl_module->sur_shuffled, this->n_cells, this->prop_count,
wp_sizes_vector.size(), sur_unshuffled); wp_sizes_vector.size(), sur_unshuffled);
ctrl_module->computeSpeciesErrors(out_vec, sur_unshuffled, this->n_cells); control_module->computeSpeciesErrors(out_vec, sur_unshuffled,
this->n_cells);
} }
/* end time measurement of control logic */
ctrl_b = MPI_Wtime();
this->ctrl_t += ctrl_b - ctrl_a;
/* start time measurement of master chemistry */ /* start time measurement of master chemistry */
sim_e_chemistry = MPI_Wtime(); sim_e_chemistry = MPI_Wtime();

File diff suppressed because it is too large Load Diff

View File

@ -4,15 +4,23 @@
#include "IO/StatsIO.hpp" #include "IO/StatsIO.hpp"
#include <cmath> #include <cmath>
bool poet::ControlModule::isControlIteration(uint32_t iter) { void poet::ControlModule::updateControlIteration(const uint32_t iter) {
global_iteration = iter;
if (control_interval == 0) {
control_interval_enabled = false;
return;
}
control_interval_enabled = (iter % control_interval == 0); control_interval_enabled = (iter % control_interval == 0);
if (control_interval_enabled) { if (control_interval_enabled) {
MSG("[Control] Control interval triggered at iteration " + MSG("[Control] Control interval enabled at iteration " +
std::to_string(iter)); std::to_string(iter));
} }
return control_interval_enabled;
} }
/*
void poet::ControlModule::beginIteration() { void poet::ControlModule::beginIteration() {
if (rollback_enabled) { if (rollback_enabled) {
if (sur_disabled_counter > 0) { if (sur_disabled_counter > 0) {
@ -23,19 +31,23 @@ void poet::ControlModule::beginIteration() {
} }
} }
} }
*/
void poet::ControlModule::endIteration(uint32_t iter) { void poet::ControlModule::endIteration(const uint32_t iter) {
if (!control_interval_enabled) {
return;
}
/* Write a checkpoint */ /* Write a checkpoint */
if (checkpoint_interval > 0 && iter % checkpoint_interval == 0) { /* Control Logic*/
if (control_interval_enabled &&
checkpoint_interval > 0 /*&& !rollback_enabled*/) {
MSG("Writing checkpoint of iteration " + std::to_string(iter)); MSG("Writing checkpoint of iteration " + std::to_string(iter));
write_checkpoint(out_dir, "checkpoint" + std::to_string(iter) + ".hdf5", write_checkpoint(out_dir, "checkpoint" + std::to_string(iter) + ".hdf5",
{.field = chem->getField(), .iteration = iter}); {.field = chem->getField(), .iteration = iter});
} writeStatsToCSV(error_history, species_names, out_dir, "stats_overview");
/* Control Logic*/ /*
if (control_interval_enabled && !rollback_enabled) {
writeStatsToCSV(error_history, species_names, out_dir,
"stats_overview");
if (triggerRollbackIfExceeded(*chem, *params, iter)) { if (triggerRollbackIfExceeded(*chem, *params, iter)) {
rollback_enabled = true; rollback_enabled = true;
@ -44,9 +56,12 @@ void poet::ControlModule::endIteration(uint32_t iter) {
MSG("Interpolation disabled for the next " + MSG("Interpolation disabled for the next " +
std::to_string(control_interval) + "."); std::to_string(control_interval) + ".");
} }
*/
} }
} }
/*
void poet::ControlModule::BCastControlFlags() { void poet::ControlModule::BCastControlFlags() {
int interp_flag = rollback_enabled ? 0 : 1; int interp_flag = rollback_enabled ? 0 : 1;
int dht_fill_flag = rollback_enabled ? 1 : 0; int dht_fill_flag = rollback_enabled ? 1 : 0;
@ -54,6 +69,9 @@ void poet::ControlModule::BCastControlFlags() {
chem->ChemBCast(&dht_fill_flag, 1, MPI_INT); chem->ChemBCast(&dht_fill_flag, 1, MPI_INT);
} }
*/
/*
bool poet::ControlModule::triggerRollbackIfExceeded(ChemistryModule &chem, bool poet::ControlModule::triggerRollbackIfExceeded(ChemistryModule &chem,
RuntimeParameters &params, RuntimeParameters &params,
uint32_t &iter) { uint32_t &iter) {
@ -91,17 +109,20 @@ bool poet::ControlModule::triggerRollbackIfExceeded(ChemistryModule &chem,
} }
} }
MSG("All species are within their MAPE and RRMSE thresholds."); MSG("All species are within their MAPE and RRMSE thresholds.");
return false; return
}
false;
}
*/
void poet::ControlModule::computeSpeciesErrors( void poet::ControlModule::computeSpeciesErrors(
const std::vector<double> &reference_values, const std::vector<double> &reference_values,
const std::vector<double> &surrogate_values, uint32_t size_per_prop) { const std::vector<double> &surrogate_values, const uint32_t size_per_prop) {
SimulationErrorStats species_error_stats(species_count, params->global_iter, SimulationErrorStats species_error_stats(this->species_names.size(),
rollback_counter); global_iteration,
/*rollback_counter*/ 0);
for (uint32_t i = 0; i < species_count; ++i) { for (uint32_t i = 0; i < this->species_names.size(); ++i) {
double err_sum = 0.0; double err_sum = 0.0;
double sqr_err_sum = 0.0; double sqr_err_sum = 0.0;
uint32_t base_idx = i * size_per_prop; uint32_t base_idx = i * size_per_prop;

View File

@ -16,43 +16,24 @@ class ChemistryModule;
class ControlModule { class ControlModule {
public: public:
ControlModule(RuntimeParameters *run_params, ChemistryModule *chem_module)
: params(run_params), chem(chem_module) {};
/* Control configuration*/ /* Control configuration*/
std::vector<std::string> species_names;
uint32_t species_count = 0;
std::string out_dir;
bool rollback_enabled = false; // std::uint32_t global_iter = 0;
bool control_interval_enabled = false; // std::uint32_t sur_disabled_counter = 0;
// std::uint32_t rollback_counter = 0;
std::uint32_t global_iter = 0; void updateControlIteration(const uint32_t iter);
std::uint32_t sur_disabled_counter = 0;
std::uint32_t rollback_counter = 0;
std::uint32_t checkpoint_interval = 0;
std::uint32_t control_interval = 0;
std::vector<double> mape_threshold; auto GetGlobalIteration() const noexcept { return global_iteration; }
std::vector<double> rrmse_threshold;
double ctrl_t = 0.; // void beginIteration();
double bcast_ctrl_t = 0.;
double recv_ctrl_t = 0.;
/* Buffer for shuffled surrogate data */ void endIteration(const uint32_t iter);
std::vector<double> sur_shuffled;
bool isControlIteration(uint32_t iter); // void BCastControlFlags();
void beginIteration(); //bool triggerRollbackIfExceeded(ChemistryModule &chem,
// RuntimeParameters &params, uint32_t &iter);
void endIteration(uint32_t iter);
void BCastControlFlags();
bool triggerRollbackIfExceeded(ChemistryModule &chem,
RuntimeParameters &params, uint32_t &iter);
struct SimulationErrorStats { struct SimulationErrorStats {
std::vector<double> mape; std::vector<double> mape;
@ -60,14 +41,14 @@ public:
uint32_t iteration; // iterations in simulation after rollbacks uint32_t iteration; // iterations in simulation after rollbacks
uint32_t rollback_count; uint32_t rollback_count;
SimulationErrorStats(size_t species_count, uint32_t iter, uint32_t counter) SimulationErrorStats(uint32_t species_count, uint32_t iter, uint32_t counter)
: mape(species_count, 0.0), rrmse(species_count, 0.0), iteration(iter), : mape(species_count, 0.0), rrmse(species_count, 0.0), iteration(iter),
rollback_count(counter) {} rollback_count(counter) {}
}; };
static void computeSpeciesErrors(const std::vector<double> &reference_values, void computeSpeciesErrors(const std::vector<double> &reference_values,
const std::vector<double> &surrogate_values, const std::vector<double> &surrogate_values,
uint32_t size_per_prop); const uint32_t size_per_prop);
std::vector<SimulationErrorStats> error_history; std::vector<SimulationErrorStats> error_history;
@ -75,34 +56,53 @@ public:
std::string out_dir; std::string out_dir;
std::uint32_t checkpoint_interval; std::uint32_t checkpoint_interval;
std::uint32_t control_interval; std::uint32_t control_interval;
std::uint32_t species_count;
std::vector<std::string> species_names; std::vector<std::string> species_names;
std::vector<double> mape_threshold; std::vector<double> mape_threshold;
std::vector<double> rrmse_threshold;
}; };
void enableControlLogic(const ControlSetup &setup) { void enableControlLogic(const ControlSetup &setup) {
out_dir = setup.out_dir; this->out_dir = setup.out_dir;
checkpoint_interval = setup.checkpoint_interval; this->checkpoint_interval = setup.checkpoint_interval;
control_interval = setup.control_interval; this->control_interval = setup.control_interval;
species_count = setup.species_count; this->species_names = setup.species_names;
this->mape_threshold = setup.mape_threshold;
species_names = setup.species_names;
mape_threshold = setup.mape_threshold;
rrmse_threshold = setup.rrmse_threshold;
} }
bool GetControlIntervalEnabled() const {
return this->control_interval_enabled;
}
auto GetControlInterval() const { return this->control_interval; }
std::vector<double> GetMapeThreshold() const { return this->mape_threshold; }
/* Profiling getters */ /* Profiling getters */
auto GetMasterCtrlLogicTime() const { return this->ctrl_t; } auto GetMasterCtrlLogicTime() const { return this->ctrl_time; }
auto GetMasterCtrlBcastTime() const { return this->bcast_ctrl_t; } auto GetMasterCtrlBcastTime() const { return this->bcast_ctrl_time; }
auto GetMasterRecvCtrlLogicTime() const { return this->recv_ctrl_t; } auto GetMasterRecvCtrlLogicTime() const { return this->recv_ctrl_time; }
private: private:
RuntimeParameters *params; bool rollback_enabled = false;
ChemistryModule *chem; bool control_interval_enabled = false;
poet::ChemistryModule *chem = nullptr;
std::uint32_t checkpoint_interval = 0;
std::uint32_t control_interval = 0;
std::uint32_t global_iteration = 0;
std::vector<double> mape_threshold;
std::vector<std::string> species_names;
std::string out_dir;
double ctrl_time = 0.0;
double bcast_ctrl_time = 0.0;
double recv_ctrl_time = 0.0;
/* Buffer for shuffled surrogate data */
std::vector<double> sur_shuffled;
}; };
} // namespace poet } // namespace poet

View File

@ -25,7 +25,7 @@
#include "Base/RInsidePOET.hpp" #include "Base/RInsidePOET.hpp"
#include "CLI/CLI.hpp" #include "CLI/CLI.hpp"
#include "Chemistry/ChemistryModule.hpp" #include "Chemistry/ChemistryModule.hpp"
#include "Control/ControlManager.hpp" #include "Control/ControlModule.hpp"
#include "DataStructures/Field.hpp" #include "DataStructures/Field.hpp"
#include "Init/InitialList.hpp" #include "Init/InitialList.hpp"
#include "Transport/DiffusionModule.hpp" #include "Transport/DiffusionModule.hpp"
@ -255,8 +255,6 @@ int parseInitValues(int argc, char **argv, RuntimeParameters &params) {
Rcpp::as<uint32_t>(global_rt_setup->operator[]("checkpoint_interval")); Rcpp::as<uint32_t>(global_rt_setup->operator[]("checkpoint_interval"));
params.mape_threshold = Rcpp::as<std::vector<double>>( params.mape_threshold = Rcpp::as<std::vector<double>>(
global_rt_setup->operator[]("mape_threshold")); global_rt_setup->operator[]("mape_threshold"));
params.rrmse_threshold = Rcpp::as<std::vector<double>>(
global_rt_setup->operator[]("rrmse_threshold"));
} catch (const std::exception &e) { } catch (const std::exception &e) {
ERRMSG("Error while parsing R scripts: " + std::string(e.what())); ERRMSG("Error while parsing R scripts: " + std::string(e.what()));
return ParseRet::PARSER_ERROR; return ParseRet::PARSER_ERROR;
@ -300,7 +298,6 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
/* SIMULATION LOOP */ /* SIMULATION LOOP */
double dSimTime{0}; double dSimTime{0};
double chkTime = 0.0;
for (uint32_t iter = 1; iter < maxiter + 1; iter++) { for (uint32_t iter = 1; iter < maxiter + 1; iter++) {
// Rollback countdown // Rollback countdown
@ -315,10 +312,10 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
} }
} }
*/ */
control.beginIteration(iter); //control.beginIteration(iter);
// params.global_iter = iter; // params.global_iter = iter;
control.isControlIteration(iter); control.updateControlIteration(iter);
// params.control_interval_enabled = (iter % params.control_interval == 0); // params.control_interval_enabled = (iter % params.control_interval == 0);
double start_t = MPI_Wtime(); double start_t = MPI_Wtime();
@ -431,8 +428,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
MSG("End of *coupling* iteration " + std::to_string(iter) + "/" + MSG("End of *coupling* iteration " + std::to_string(iter) + "/" +
std::to_string(maxiter)); std::to_string(maxiter));
double chk_start = MPI_Wtime(); control.endIteration(iter);
control.endIteration(iter)
/* /*
if (iter % params.checkpoint_interval == 0) { if (iter % params.checkpoint_interval == 0) {
MSG("Writing checkpoint of iteration " + std::to_string(iter)); MSG("Writing checkpoint of iteration " + std::to_string(iter));
@ -457,8 +453,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
*/ */
double chk_end = MPI_Wtime();
chkTime += chk_end - chk_start;
// MSG(); // MSG();
} // END SIMULATION LOOP } // END SIMULATION LOOP
@ -476,13 +471,14 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
Rcpp::List diffusion_profiling; Rcpp::List diffusion_profiling;
diffusion_profiling["simtime"] = diffusion.getTransportTime(); diffusion_profiling["simtime"] = diffusion.getTransportTime();
Rcpp::List ctrl_profiling; /*Rcpp::List ctrl_profiling;
ctrl_profiling["checkpointing_time"] = chkTime; ctrl_profiling["checkpointing_time"] = chkTime;
ctrl_profiling["ctrl_logic_master"] = chem.GetMasterCtrlLogicTime(); ctrl_profiling["ctrl_logic_master"] = chem.GetMasterCtrlLogicTime();
ctrl_profiling["bcast_ctrl_logic_master"] = chem.GetMasterCtrlBcastTime(); ctrl_profiling["bcast_ctrl_logic_master"] = chem.GetMasterCtrlBcastTime();
ctrl_profiling["recv_ctrl_logic_maser"] = chem.GetMasterRecvCtrlLogicTime(); ctrl_profiling["recv_ctrl_logic_maser"] = chem.GetMasterRecvCtrlLogicTime();
ctrl_profiling["ctrl_logic_worker"] = ctrl_profiling["ctrl_logic_worker"] =
Rcpp::wrap(chem.GetWorkerControlTimings()); Rcpp::wrap(chem.GetWorkerControlTimings());
*/
if (params.use_dht) { if (params.use_dht) {
chem_profiling["dht_hits"] = Rcpp::wrap(chem.GetWorkerDHTHits()); chem_profiling["dht_hits"] = Rcpp::wrap(chem.GetWorkerDHTHits());
@ -510,7 +506,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
profiling["simtime"] = dSimTime; profiling["simtime"] = dSimTime;
profiling["chemistry"] = chem_profiling; profiling["chemistry"] = chem_profiling;
profiling["diffusion"] = diffusion_profiling; profiling["diffusion"] = diffusion_profiling;
profiling["ctrl_logic"] = ctrl_profiling; //profiling["ctrl_logic"] = ctrl_profiling;
chem.MasterLoopBreak(); chem.MasterLoopBreak();
@ -652,7 +648,10 @@ int main(int argc, char *argv[]) {
ChemistryModule chemistry(run_params.work_package_size, ChemistryModule chemistry(run_params.work_package_size,
init_list.getChemistryInit(), MPI_COMM_WORLD); init_list.getChemistryInit(), MPI_COMM_WORLD);
ControlModule control(&run_params, &chemistry);
ControlModule control;
chemistry.setControlModule(&control);
const ChemistryModule::SurrogateSetup surr_setup = { const ChemistryModule::SurrogateSetup surr_setup = {
getSpeciesNames(init_list.getInitialGrid(), 0, MPI_COMM_WORLD), getSpeciesNames(init_list.getInitialGrid(), 0, MPI_COMM_WORLD),
@ -674,14 +673,11 @@ int main(int argc, char *argv[]) {
run_params.out_dir, // added run_params.out_dir, // added
run_params.checkpoint_interval, run_params.checkpoint_interval,
run_params.control_interval, run_params.control_interval,
run_params.species_count, getSpeciesNames(init_list.getInitialGrid(), 0, MPI_COMM_WORLD),
run_params.species_names, run_params.mape_threshold};
run_params.mape_threshold,
run_params.rrmse_threshold};
control.enableControlLogic(ctrl_setup); control.enableControlLogic(ctrl_setup);
if (MY_RANK > 0) { if (MY_RANK > 0) {
chemistry.WorkerLoop(); chemistry.WorkerLoop();
} else { } else {
@ -725,7 +721,7 @@ int main(int argc, char *argv[]) {
chemistry.masterSetField(init_list.getInitialGrid()); chemistry.masterSetField(init_list.getInitialGrid());
Rcpp::List profiling = RunMasterLoop(R, run_params, diffusion, chemistry); Rcpp::List profiling = RunMasterLoop(R, run_params, diffusion, chemistry, control);
MSG("finished simulation loop"); MSG("finished simulation loop");

View File

@ -51,15 +51,9 @@ struct RuntimeParameters {
bool print_progress = false; bool print_progress = false;
bool rollback_enabled = false;
bool control_interval_enabled = false;
std::uint32_t global_iter = 0;
std::uint32_t sur_disabled_counter = 0;
std::uint32_t rollback_counter = 0;
std::uint32_t checkpoint_interval = 0; std::uint32_t checkpoint_interval = 0;
std::uint32_t control_interval = 0; std::uint32_t control_interval = 0;
std::vector<double> mape_threshold; std::vector<double> mape_threshold;
std::vector<double> rrmse_threshold;
static constexpr std::uint32_t WORK_PACKAGE_SIZE_DEFAULT = 32; static constexpr std::uint32_t WORK_PACKAGE_SIZE_DEFAULT = 32;
std::uint32_t work_package_size = WORK_PACKAGE_SIZE_DEFAULT; std::uint32_t work_package_size = WORK_PACKAGE_SIZE_DEFAULT;