Compare commits

...

3 Commits

Author SHA1 Message Date
rastogi
cf64110dee Added missing parameters to rt file 2025-12-08 18:01:54 +01:00
rastogi
a8af28c73a Add _rt.R file to repo 2025-12-08 17:38:49 +01:00
rastogi
81aed9826d feat: Added rb_limit, rb_interval_limit and condition for bcasting control flags 2025-12-08 17:17:26 +01:00
12 changed files with 564 additions and 423 deletions

1
.gitignore vendored
View File

@ -155,6 +155,7 @@ include/
# But keep these specific files # But keep these specific files
!bin/plot/ !bin/plot/
!bin/dolo_fgcs_3.qs2 !bin/dolo_fgcs_3.qs2
!bin/dolo_fgcs_3_rt.R
!bin/dolo_fgcs_3.R !bin/dolo_fgcs_3.R
!bin/dolo_fgcs_3.pqi !bin/dolo_fgcs_3.pqi
!bin/phreeqc_kin.dat !bin/phreeqc_kin.dat

26
bin/dolo_fgcs_3_rt.R Normal file
View File

@ -0,0 +1,26 @@
iterations <- 10000
dt <- 200
chkpt_interval <- 100
stab_interval <- 100
mape_threshold <- rep(0.0035, 13)
mape_threshold[5] <- 1 #Charge
zero_abs <- 1e-13
rb_limit <- 3
rb_interval_limit <- 100
ctrl_cell_ids <- seq(0, (400*400)/2 - 1, by = 401)
#out_save <- seq(500, iterations, by = 500)
out_save = c(seq(1, 10), seq(10, 100, by= 10), seq(200, iterations, by=100))
list(
timesteps = rep(dt, iterations),
store_result = TRUE,
out_save = out_save,
chkpt_interval = chkpt_interval,
stab_interval = stab_interval,
mape_threshold = mape_threshold,
zero_abs = zero_abs,
rb_limit = rb_limit,
rb_interval_limit = rb_interval_limit,
ctrl_cell_ids = ctrl_cell_ids
)

View File

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
#SBATCH --job-name=p2_eps01_no_skip #SBATCH --job-name=p2_eps0035_v3
#SBATCH --output=p2_eps01_no_skip_%j.out #SBATCH --output=p2_eps0035_v3_%j.out
#SBATCH --error=p2_eps01_no_skip_%j.err #SBATCH --error=p2_eps0035_v3_%j.err
#SBATCH --partition=long #SBATCH --partition=long
#SBATCH --nodes=6 #SBATCH --nodes=6
#SBATCH --ntasks-per-node=24 #SBATCH --ntasks-per-node=24
@ -15,5 +15,5 @@ module purge
module load cmake gcc openmpi module load cmake gcc openmpi
#mpirun -n 144 ./poet dolo_fgcs_3.R dolo_fgcs_3.qs2 dolo_only_pqc #mpirun -n 144 ./poet dolo_fgcs_3.R dolo_fgcs_3.qs2 dolo_only_pqc
mpirun -n 144 ./poet --interp --rds dolo_fgcs_3_rt.R dolo_fgcs_3.qs2 p2_eps01_no_skip mpirun -n 144 ./poet --interp --rds dolo_fgcs_3_rt.R dolo_fgcs_3.qs2 p2_eps0035_v3
#mpirun -n 144 ./poet --interp barite_fgcs_4_new/barite_fgcs_4_new_rt.R barite_fgcs_4_new/barite_fgcs_4_new.qs2 barite #mpirun -n 144 ./poet --interp barite_fgcs_4_new/barite_fgcs_4_new_rt.R barite_fgcs_4_new/barite_fgcs_4_new.qs2 barite

View File

@ -44,8 +44,7 @@ public:
* \param wp_size Count of grid cells to fill each work package at maximum. * \param wp_size Count of grid cells to fill each work package at maximum.
* \param communicator MPI communicator to distribute work in. * \param communicator MPI communicator to distribute work in.
*/ */
ChemistryModule(uint32_t wp_size, ChemistryModule(uint32_t wp_size, const InitialList::ChemistryInit chem_params,
const InitialList::ChemistryInit chem_params,
MPI_Comm communicator); MPI_Comm communicator);
/** /**
@ -71,8 +70,7 @@ public:
auto GetChemistryTime() const { return this->chem_t; } auto GetChemistryTime() const { return this->chem_t; }
void setFilePadding(std::uint32_t maxiter) { void setFilePadding(std::uint32_t maxiter) {
this->file_pad = this->file_pad = static_cast<std::uint8_t>(std::ceil(std::log10(maxiter + 1)));
static_cast<std::uint8_t>(std::ceil(std::log10(maxiter + 1)));
} }
struct SurrogateSetup { struct SurrogateSetup {
@ -105,8 +103,7 @@ public:
this->ctrl_file_out_dir = setup.dht_out_dir; this->ctrl_file_out_dir = setup.dht_out_dir;
if (this->dht_enabled || this->interp_enabled) { if (this->dht_enabled || this->interp_enabled) {
this->initializeDHT(setup.dht_size_mb, this->params.dht_species, this->initializeDHT(setup.dht_size_mb, this->params.dht_species, setup.has_het_ids);
setup.has_het_ids);
if (setup.dht_snaps != DHT_SNAPS_DISABLED) { if (setup.dht_snaps != DHT_SNAPS_DISABLED) {
this->setDHTSnapshots(setup.dht_snaps, setup.dht_out_dir); this->setDHTSnapshots(setup.dht_snaps, setup.dht_out_dir);
@ -115,8 +112,7 @@ public:
if (this->interp_enabled) { if (this->interp_enabled) {
this->initializeInterp(setup.interp_bucket_size, setup.interp_size_mb, this->initializeInterp(setup.interp_bucket_size, setup.interp_size_mb,
setup.interp_min_entries, setup.interp_min_entries, this->params.interp_species);
this->params.interp_species);
} }
} }
@ -239,9 +235,7 @@ public:
* *
* \param enabled True if print progressbar, false if not. * \param enabled True if print progressbar, false if not.
*/ */
void setProgressBarPrintout(bool enabled) { void setProgressBarPrintout(bool enabled) { this->print_progessbar = enabled; };
this->print_progessbar = enabled;
};
/** /**
* **Master only** Set the ai surrogate validity vector from R * **Master only** Set the ai surrogate validity vector from R
@ -275,8 +269,7 @@ public:
} }
protected: protected:
void initializeDHT(uint32_t size_mb, void initializeDHT(uint32_t size_mb, const NamedVector<std::uint32_t> &key_species,
const NamedVector<std::uint32_t> &key_species,
bool has_het_ids); bool has_het_ids);
void setDHTSnapshots(int type, const std::string &out_dir); void setDHTSnapshots(int type, const std::string &out_dir);
void setDHTReadFile(const std::string &input_file); void setDHTReadFile(const std::string &input_file);
@ -349,9 +342,8 @@ protected:
void MasterRunSequential(); void MasterRunSequential();
void MasterSendPkgs(worker_list_t &w_list, workpointer_t &work_pointer, void MasterSendPkgs(worker_list_t &w_list, workpointer_t &work_pointer,
workpointer_t &sur_pointer, int &pkg_to_send, workpointer_t &sur_pointer, int &pkg_to_send, int &count_pkgs,
int &count_pkgs, int &free_workers, double dt, int &free_workers, double dt, uint32_t iteration,
uint32_t iteration,
const std::vector<uint32_t> &wp_sizes_vector); const std::vector<uint32_t> &wp_sizes_vector);
void MasterRecvPkgs(worker_list_t &w_list, int &pkg_to_recv, bool to_send, void MasterRecvPkgs(worker_list_t &w_list, int &pkg_to_recv, bool to_send,
int &free_workers); int &free_workers);
@ -361,8 +353,7 @@ protected:
void WorkerProcessPkgs(struct worker_s &timings, uint32_t &iteration); void WorkerProcessPkgs(struct worker_s &timings, uint32_t &iteration);
void WorkerDoWork(MPI_Status &probe_status, int double_count, void WorkerDoWork(MPI_Status &probe_status, int double_count, struct worker_s &timings);
struct worker_s &timings);
void WorkerPostIter(MPI_Status &probe_status, uint32_t iteration); void WorkerPostIter(MPI_Status &probe_status, uint32_t iteration);
void WorkerPostSim(uint32_t iteration); void WorkerPostSim(uint32_t iteration);
@ -372,26 +363,23 @@ protected:
void WorkerPerfToMaster(int type, const struct worker_s &timings); void WorkerPerfToMaster(int type, const struct worker_s &timings);
void WorkerMetricsToMaster(int type); void WorkerMetricsToMaster(int type);
void WorkerRunWorkPackage(WorkPackage &work_package, double dSimTime, void WorkerRunWorkPackage(WorkPackage &work_package, double dSimTime, double dTimestep);
double dTimestep);
std::vector<uint32_t> CalculateWPSizesVector(uint32_t n_cells, std::vector<uint32_t> CalculateWPSizesVector(uint32_t n_cells, uint32_t wp_size) const;
uint32_t wp_size) const;
std::vector<double> shuffleField(const std::vector<double> &in_field, std::vector<double> shuffleField(const std::vector<double> &in_field,
uint32_t size_per_prop, uint32_t prop_count, uint32_t size_per_prop, uint32_t prop_count,
uint32_t wp_count); uint32_t wp_count);
void unshuffleField(const std::vector<double> &in_buffer, void unshuffleField(const std::vector<double> &in_buffer, uint32_t size_per_prop,
uint32_t size_per_prop, uint32_t prop_count, uint32_t prop_count, uint32_t wp_count,
uint32_t wp_count, std::vector<double> &out_field); std::vector<double> &out_field);
std::vector<std::int32_t> std::vector<std::int32_t>
parseDHTSpeciesVec(const NamedVector<std::uint32_t> &key_species, parseDHTSpeciesVec(const NamedVector<std::uint32_t> &key_species,
const std::vector<std::string> &to_compare) const; const std::vector<std::string> &to_compare) const;
void BCastStringVec(std::vector<std::string> &io); void BCastStringVec(std::vector<std::string> &io);
void processCtrlPkgs(std::vector<std::vector<double>> &input, void processCtrlPkgs(std::vector<std::vector<double>> &input, double current_sim_time,
double current_sim_time, double dt, double dt, struct worker_s &timings);
struct worker_s &timings);
void copyPkgs(const WorkPackage &wp, std::vector<double> &mpi_buffer); void copyPkgs(const WorkPackage &wp, std::vector<double> &mpi_buffer);
@ -408,7 +396,11 @@ protected:
} }
inline bool hasFlag(int flags, int type) { return (flags & type) != 0; } inline bool hasFlag(int flags, int type) { return (flags & type) != 0; }
inline void
extractCtrlSurVectors(std::unordered_map<uint32_t, std::vector<double>> ctrl_output,
const std::vector<double> &buffer, uint32_t n_cells,
uint32_t prop_count, std::vector<std::vector<double>> &ctrl_vec,
std::vector<std::vector<double>> &sur_vec);
int comm_size, comm_rank; int comm_size, comm_rank;
MPI_Comm group_comm; MPI_Comm group_comm;
@ -434,11 +426,7 @@ protected:
MPI_Bcast(buf, count, datatype, 0, this->group_comm); MPI_Bcast(buf, count, datatype, 0, this->group_comm);
} }
inline void PropagateFunctionType(int &type) const { inline void PropagateFunctionType(int &type) const { ChemBCast(&type, 1, MPI_INT); }
ChemBCast(&type, 1, MPI_INT);
}
double simtime = 0.; double simtime = 0.;
double idle_t = 0.; double idle_t = 0.;
@ -476,7 +464,7 @@ protected:
bool control_enabled{false}; bool control_enabled{false};
bool stab_enabled{false}; bool stab_enabled{false};
std::unordered_set<uint32_t> ctrl_cell_ids; std::unordered_set<uint32_t> ctrl_cell_ids;
std::vector<std::vector<double>> ctrl_batch; std::unordered_map<uint32_t, std::vector<double>> ctrl_batch;
}; };
} // namespace poet } // namespace poet

View File

@ -4,6 +4,8 @@
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <mpi.h> #include <mpi.h>
#include <numeric>
#include <unordered_map>
#include <vector> #include <vector>
std::vector<uint32_t> poet::ChemistryModule::MasterGatherWorkerMetrics(int type) const { std::vector<uint32_t> poet::ChemistryModule::MasterGatherWorkerMetrics(int type) const {
@ -12,7 +14,8 @@ std::vector<uint32_t> poet::ChemistryModule::MasterGatherWorkerMetrics(int type)
uint32_t dummy; uint32_t dummy;
std::vector<uint32_t> metrics(this->comm_size); std::vector<uint32_t> metrics(this->comm_size);
MPI_Gather(&dummy, 1, MPI_UINT32_T, metrics.data(), 1, MPI_UINT32_T, 0, this->group_comm); MPI_Gather(&dummy, 1, MPI_UINT32_T, metrics.data(), 1, MPI_UINT32_T, 0,
this->group_comm);
metrics.erase(metrics.begin()); metrics.erase(metrics.begin());
return metrics; return metrics;
@ -72,8 +75,8 @@ std::vector<uint32_t> poet::ChemistryModule::GetWorkerDHTHits() const {
MPI_Get_count(&probe, MPI_UINT32_T, &count); MPI_Get_count(&probe, MPI_UINT32_T, &count);
std::vector<uint32_t> ret(count); std::vector<uint32_t> ret(count);
MPI_Recv(ret.data(), count, MPI_UINT32_T, probe.MPI_SOURCE, WORKER_DHT_HITS, this->group_comm, MPI_Recv(ret.data(), count, MPI_UINT32_T, probe.MPI_SOURCE, WORKER_DHT_HITS,
NULL); this->group_comm, NULL);
return ret; return ret;
} }
@ -114,7 +117,8 @@ std::vector<double> poet::ChemistryModule::GetWorkerInterpolationGatherTimings()
return MasterGatherWorkerTimings(WORKER_IP_GATHER); return MasterGatherWorkerTimings(WORKER_IP_GATHER);
} }
std::vector<double> poet::ChemistryModule::GetWorkerInterpolationFunctionCallTimings() const { std::vector<double>
poet::ChemistryModule::GetWorkerInterpolationFunctionCallTimings() const {
int type = CHEM_PERF; int type = CHEM_PERF;
MPI_Bcast(&type, 1, MPI_INT, 0, this->group_comm); MPI_Bcast(&type, 1, MPI_INT, 0, this->group_comm);
return MasterGatherWorkerTimings(WORKER_IP_FC); return MasterGatherWorkerTimings(WORKER_IP_FC);
@ -132,8 +136,8 @@ std::vector<uint32_t> poet::ChemistryModule::GetWorkerInterpolationCalls() const
MPI_Get_count(&probe, MPI_UINT32_T, &count); MPI_Get_count(&probe, MPI_UINT32_T, &count);
std::vector<uint32_t> ret(count); std::vector<uint32_t> ret(count);
MPI_Recv(ret.data(), count, MPI_UINT32_T, probe.MPI_SOURCE, WORKER_IP_CALLS, this->group_comm, MPI_Recv(ret.data(), count, MPI_UINT32_T, probe.MPI_SOURCE, WORKER_IP_CALLS,
NULL); this->group_comm, NULL);
return ret; return ret;
} }
@ -150,13 +154,14 @@ std::vector<uint32_t> poet::ChemistryModule::GetWorkerPHTCacheHits() const {
MPI_Get_count(&probe, MPI_UINT32_T, &count); MPI_Get_count(&probe, MPI_UINT32_T, &count);
std::vector<uint32_t> ret(count); std::vector<uint32_t> ret(count);
MPI_Recv(ret.data(), count, MPI_UINT32_T, probe.MPI_SOURCE, type, this->group_comm, NULL); MPI_Recv(ret.data(), count, MPI_UINT32_T, probe.MPI_SOURCE, type, this->group_comm,
NULL);
return ret; return ret;
} }
inline std::vector<int> shuffleVector(const std::vector<int> &in_vector, uint32_t size_per_prop, inline std::vector<int> shuffleVector(const std::vector<int> &in_vector,
uint32_t wp_count) { uint32_t size_per_prop, uint32_t wp_count) {
std::vector<int> out_buffer(in_vector.size()); std::vector<int> out_buffer(in_vector.size());
uint32_t write_i = 0; uint32_t write_i = 0;
for (uint32_t i = 0; i < wp_count; i++) { for (uint32_t i = 0; i < wp_count; i++) {
@ -168,8 +173,9 @@ inline std::vector<int> shuffleVector(const std::vector<int> &in_vector, uint32_
return out_buffer; return out_buffer;
} }
inline std::vector<double> shuffleField(const std::vector<double> &in_field, uint32_t size_per_prop, inline std::vector<double> shuffleField(const std::vector<double> &in_field,
uint32_t species_count, uint32_t wp_count) { uint32_t size_per_prop, uint32_t species_count,
uint32_t wp_count) {
std::vector<double> out_buffer(in_field.size()); std::vector<double> out_buffer(in_field.size());
uint32_t write_i = 0; uint32_t write_i = 0;
for (uint32_t i = 0; i < wp_count; i++) { for (uint32_t i = 0; i < wp_count; i++) {
@ -216,20 +222,45 @@ inline void printProgressbar(int count_pkgs, int n_wp, int barWidth = 70) {
std::cout.flush(); std::cout.flush();
/* end visual progress */ /* end visual progress */
} }
inline void poet::ChemistryModule::extractCtrlSurVectors(
std::unordered_map<uint32_t, std::vector<double>> ctrl_output,
const std::vector<double> &buffer, uint32_t n_cells, uint32_t prop_count,
std::vector<std::vector<double>> &ctrl_vec,
std::vector<std::vector<double>> &sur_vec) {
ctrl_vec.reserve(ctrl_output.size());
sur_vec.reserve(ctrl_output.size());
/* for (const auto &item : ctrl_output) {
const uint32_t id = item.first;
const auto &data = item.second;
std::vector<std::vector<double>> extractSurCells(){ ctrl_vec.push_back(data);
/* extract matching surrogate output*/
bool found = false;
for (uint32_t i = 0; i < n_cells; i++) {
uint32_t curr_id = static_cast<uint32_t>(buffer[prop_count * i]);
if (curr_id == id) {
sur_vec.emplace_back(buffer.begin() + prop_count * i,
buffer.begin() + prop_count * (i + 1));
found = true;
break;
}
}
if (!found) {
/* keep alignment if missing */
std::vector<double> vec(prop_count, 0.0);
vec[0] = static_cast<double>(id);
sur_vec.emplace_back(vec);
}
}
} }
*/
inline void poet::ChemistryModule::MasterSendPkgs(worker_list_t &w_list, inline void poet::ChemistryModule::MasterSendPkgs(
workpointer_t &work_pointer, worker_list_t &w_list, workpointer_t &work_pointer, workpointer_t &sur_pointer,
workpointer_t &sur_pointer, int &pkg_to_send, int &pkg_to_send, int &count_pkgs, int &free_workers, double dt, uint32_t iteration,
int &count_pkgs, int &free_workers, double dt, const std::vector<uint32_t> &wp_sizes_vector) {
uint32_t iteration,
const std::vector<uint32_t> &wp_sizes_vector) {
/* declare variables */ /* declare variables */
int local_work_package_size; int local_work_package_size;
@ -243,8 +274,8 @@ inline void poet::ChemistryModule::MasterSendPkgs(worker_list_t &w_list,
local_work_package_size = (int)wp_sizes_vector[count_pkgs]; local_work_package_size = (int)wp_sizes_vector[count_pkgs];
count_pkgs++; count_pkgs++;
uint32_t wp_start_index = std::accumulate(wp_sizes_vector.begin(), uint32_t wp_start_index = std::accumulate(
std::next(wp_sizes_vector.begin(), count_pkgs), 0); wp_sizes_vector.begin(), std::next(wp_sizes_vector.begin(), count_pkgs), 0);
/* note current processed work package in workerlist */ /* note current processed work package in workerlist */
w_list[p].send_addr = work_pointer.base(); w_list[p].send_addr = work_pointer.base();
@ -273,12 +304,6 @@ inline void poet::ChemistryModule::MasterSendPkgs(worker_list_t &w_list,
send_buffer[end_of_wp + 4] = wp_start_index; send_buffer[end_of_wp + 4] = wp_start_index;
// control flags (bitmask) // control flags (bitmask)
/*
int flags = (this->interp_enabled ? 1 : 0) | (this->dht_enabled ? 2 : 0) |
(this->warmup_enabled ? 4 : 0) |
(this->control_enabled ? 8 : 0);
send_buffer[end_of_wp + 5] = static_cast<double>(flags);
*/
/* ATTENTION Worker p has rank p+1 */ /* ATTENTION Worker p has rank p+1 */
// MPI_Send(send_buffer, end_of_wp + BUFFER_OFFSET, MPI_DOUBLE, p + 1, // MPI_Send(send_buffer, end_of_wp + BUFFER_OFFSET, MPI_DOUBLE, p + 1,
// LOOP_WORK, this->group_comm); // LOOP_WORK, this->group_comm);
@ -311,11 +336,12 @@ inline void poet::ChemistryModule::MasterRecvPkgs(worker_list_t &w_list, int &pk
// only of there are still packages to send and free workers are available // only of there are still packages to send and free workers are available
if (to_send && free_workers > 0) if (to_send && free_workers > 0)
// non blocking probing // non blocking probing
MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &need_to_receive, &probe_status); MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, this->group_comm, &need_to_receive,
&probe_status);
else { else {
idle_a = MPI_Wtime(); idle_a = MPI_Wtime();
// blocking probing // blocking probing
MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &probe_status); MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, this->group_comm, &probe_status);
idle_b = MPI_Wtime(); idle_b = MPI_Wtime();
this->idle_t += idle_b - idle_a; this->idle_t += idle_b - idle_a;
} }
@ -354,7 +380,11 @@ inline void poet::ChemistryModule::MasterRecvPkgs(worker_list_t &w_list, int &pk
for (std::size_t i = 0; i < cells_per_batch; i++) { for (std::size_t i = 0; i < cells_per_batch; i++) {
std::vector<double> cell_output(recv_buffer.begin() + this->prop_count * i, std::vector<double> cell_output(recv_buffer.begin() + this->prop_count * i,
recv_buffer.begin() + this->prop_count * (i + 1)); recv_buffer.begin() + this->prop_count * (i + 1));
this->ctrl_batch.push_back(cell_output);
if (!cell_output.empty()) {
uint32_t id = static_cast<uint32_t>(cell_output[0]);
this->ctrl_batch[id] = cell_output;
}
} }
break; break;
} }
@ -422,19 +452,19 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
if (this->ai_surrogate_enabled) { if (this->ai_surrogate_enabled) {
ftype = CHEM_AI_BCAST_VALIDITY; ftype = CHEM_AI_BCAST_VALIDITY;
PropagateFunctionType(ftype); PropagateFunctionType(ftype);
this->ai_surrogate_validity_vector = this->ai_surrogate_validity_vector = shuffleVector(
shuffleVector(this->ai_surrogate_validity_vector, this->n_cells, wp_sizes_vector.size()); this->ai_surrogate_validity_vector, this->n_cells, wp_sizes_vector.size());
ChemBCast(&this->ai_surrogate_validity_vector.front(), this->n_cells, MPI_INT); ChemBCast(&this->ai_surrogate_validity_vector.front(), this->n_cells, MPI_INT);
} }
ftype = CHEM_CTRL_FLAGS; if (control->needsFlagBcast()) {
PropagateFunctionType(ftype); ftype = CHEM_CTRL_FLAGS;
uint32_t ctrl_flags = buildCtrlFlags(this->dht_enabled, this->interp_enabled, this->stab_enabled); PropagateFunctionType(ftype);
ChemBCast(&ctrl_flags, 1, MPI_UINT32_T); uint32_t ctrl_flags =
// std::cout << "[Master] Flags mask=" << ctrl_flags buildCtrlFlags(this->dht_enabled, this->interp_enabled, this->stab_enabled);
// << " dht=" << this->dht_enabled ChemBCast(&ctrl_flags, 1, MPI_UINT32_T);
// << " ip=" << this->interp_enabled }
// << " stab=" << this->stab_enabled << std::endl; this->ctrl_batch.clear();
ftype = CHEM_WORK_LOOP; ftype = CHEM_WORK_LOOP;
PropagateFunctionType(ftype); PropagateFunctionType(ftype);
@ -448,8 +478,8 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
/* shuffle grid */ /* shuffle grid */
// grid.shuffleAndExport(mpi_buffer); // grid.shuffleAndExport(mpi_buffer);
std::vector<double> mpi_buffer = std::vector<double> mpi_buffer = shuffleField(chem_field.AsVector(), this->n_cells,
shuffleField(chem_field.AsVector(), this->n_cells, this->prop_count, wp_sizes_vector.size()); this->prop_count, wp_sizes_vector.size());
/* setup local variables */ /* setup local variables */
pkg_to_send = wp_sizes_vector.size(); pkg_to_send = wp_sizes_vector.size();
@ -479,8 +509,8 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
// while there are still packages to send // while there are still packages to send
if (pkg_to_send > 0) { if (pkg_to_send > 0) {
// send packages to all free workers ... // send packages to all free workers ...
MasterSendPkgs(worker_list, work_pointer, sur_pointer, pkg_to_send, i_pkgs, free_workers, dt, MasterSendPkgs(worker_list, work_pointer, sur_pointer, pkg_to_send, i_pkgs,
iteration, wp_sizes_vector); free_workers, dt, iteration, wp_sizes_vector);
} }
// ... and try to receive them from workers who has finished their work // ... and try to receive them from workers who has finished their work
MasterRecvPkgs(worker_list, pkg_to_recv, pkg_to_send > 0, free_workers); MasterRecvPkgs(worker_list, pkg_to_recv, pkg_to_send > 0, free_workers);
@ -499,7 +529,8 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
/* unshuffle grid */ /* unshuffle grid */
// grid.importAndUnshuffle(mpi_buffer); // grid.importAndUnshuffle(mpi_buffer);
std::vector<double> out_vec{mpi_buffer}; std::vector<double> out_vec{mpi_buffer};
unshuffleField(mpi_buffer, this->n_cells, this->prop_count, wp_sizes_vector.size(), out_vec); unshuffleField(mpi_buffer, this->n_cells, this->prop_count, wp_sizes_vector.size(),
out_vec);
chem_field = out_vec; chem_field = out_vec;
/* do master stuff */ /* do master stuff */
@ -508,31 +539,42 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
std::cout << "[Master] Processing " << this->ctrl_batch.size() std::cout << "[Master] Processing " << this->ctrl_batch.size()
<< " control cells for comparison." << std::endl; << " control cells for comparison." << std::endl;
std::vector<std::vector<double>> sur_batch; std::vector<std::vector<double>> ctrl_vec;
sur_batch.reserve(this->ctrl_batch.size()); std::vector<std::vector<double>> sur_vec;
for (const auto &element : this->ctrl_batch) { extractCtrlSurVectors(this->ctrl_batch, mpi_buffer, this->n_cells, this->prop_count,
ctrl_vec, sur_vec);
/* using mpi-buffer because we need cell-major layout*/ // Diagnostics: iterate map correctly
for (size_t i = 0; i < this->n_cells; i++) { if (this->ctrl_batch.size() > this->ctrl_cell_ids.size()) {
uint32_t curr_cell_id = mpi_buffer[this->prop_count * i]; std::cout << "[Master] Warning: ctrl_batch (" << this->ctrl_batch.size()
<< ") is larger than ctrl_cell_ids (" << this->ctrl_cell_ids.size()
<< "). Dumping IDs and species concentrations:" << std::endl;
if (curr_cell_id == element[0]) { for (const auto &kv : this->ctrl_batch) {
std::vector<double> sur_output(mpi_buffer.begin() + this->prop_count * i, const uint32_t id = kv.first;
mpi_buffer.begin() + this->prop_count * (i + 1)); const auto &row = kv.second;
sur_batch.push_back(sur_output); if (row.empty())
break; continue;
// Print all columns with names, starting from k = 0
for (std::size_t k = 0; k < row.size(); ++k) {
std::string name = (k < prop_names.size())
? prop_names[k] // species_names/prop_names
: ("prop" + std::to_string(k)); // fallback
std::cout << name << "=" << row[k];
if (k + 1 < row.size())
std::cout << ", ";
} }
std::cout << std::endl;
} }
} }
metrics_a = MPI_Wtime(); metrics_a = MPI_Wtime();
control->computeMetrics(this->ctrl_batch, sur_batch, prop_names, ctrl_cell_ids.size(), control->computeMetrics(ctrl_vec, sur_vec, prop_names, ctrl_cell_ids.size(),
ctrl_file_out_dir); ctrl_file_out_dir);
metrics_b = MPI_Wtime(); metrics_b = MPI_Wtime();
this->metrics_t += metrics_b - metrics_a; this->metrics_t += metrics_b - metrics_a;
this->ctrl_batch.clear();
} }
/* start time measurement of master chemistry */ /* start time measurement of master chemistry */
@ -543,7 +585,12 @@ void poet::ChemistryModule::MasterRunParallel(double dt) {
this->seq_t += seq_d - seq_c; this->seq_t += seq_d - seq_c;
/* end time measurement of whole chemistry simulation */ /* end time measurement of whole chemistry simulation */
std::optional<uint32_t> target = control->findRbTarget(prop_names);
std::optional<uint32_t> target = std::nullopt;
if (!this->ctrl_batch.empty()) {
target = control->findRbTarget(prop_names);
}
int flush = target.has_value() ? 1 : 0; int flush = target.has_value() ? 1 : 0;
/* advise workers to end chemistry iteration */ /* advise workers to end chemistry iteration */
@ -566,8 +613,8 @@ void poet::ChemistryModule::MasterLoopBreak() {
MPI_Bcast(&type, 1, MPI_INT, 0, this->group_comm); MPI_Bcast(&type, 1, MPI_INT, 0, this->group_comm);
} }
std::vector<uint32_t> poet::ChemistryModule::CalculateWPSizesVector(uint32_t n_cells, std::vector<uint32_t>
uint32_t wp_size) const { poet::ChemistryModule::CalculateWPSizesVector(uint32_t n_cells, uint32_t wp_size) const {
bool mod_pkgs = (n_cells % wp_size) != 0; bool mod_pkgs = (n_cells % wp_size) != 0;
uint32_t n_packages = (uint32_t)(n_cells / wp_size) + static_cast<int>(mod_pkgs); uint32_t n_packages = (uint32_t)(n_cells / wp_size) + static_cast<int>(mod_pkgs);

View File

@ -13,6 +13,7 @@
#include <mpi.h> #include <mpi.h>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
namespace poet { namespace poet {
@ -93,7 +94,8 @@ void poet::ChemistryModule::WorkerLoop() {
} }
} }
void poet::ChemistryModule::WorkerProcessPkgs(struct worker_s &timings, uint32_t &iteration) { void poet::ChemistryModule::WorkerProcessPkgs(struct worker_s &timings,
uint32_t &iteration) {
MPI_Status probe_status; MPI_Status probe_status;
bool loop = true; bool loop = true;
@ -123,7 +125,8 @@ void poet::ChemistryModule::WorkerProcessPkgs(struct worker_s &timings, uint32_t
} }
} }
void poet::ChemistryModule::copyPkgs(const WorkPackage &wp, std::vector<double> &mpi_buffer) { void poet::ChemistryModule::copyPkgs(const WorkPackage &wp,
std::vector<double> &mpi_buffer) {
for (std::size_t wp_i = 0; wp_i < wp.size; wp_i++) { for (std::size_t wp_i = 0; wp_i < wp.size; wp_i++) {
std::copy(wp.output[wp_i].begin(), wp.output[wp_i].end(), std::copy(wp.output[wp_i].begin(), wp.output[wp_i].end(),
@ -153,8 +156,8 @@ void poet::ChemistryModule::processCtrlPkgs(std::vector<std::vector<double>> &in
copyPkgs(control_wp, mpi_buffer); copyPkgs(control_wp, mpi_buffer);
MPI_Request send_req; MPI_Request send_req;
MPI_Isend(mpi_buffer.data(), mpi_buffer.size(), MPI_DOUBLE, 0, LOOP_CTRL, MPI_COMM_WORLD, MPI_Isend(mpi_buffer.data(), mpi_buffer.size(), MPI_DOUBLE, 0, LOOP_CTRL,
&send_req); this->group_comm, &send_req);
MPI_Wait(&send_req, MPI_STATUS_IGNORE); MPI_Wait(&send_req, MPI_STATUS_IGNORE);
} }
@ -175,14 +178,12 @@ void poet::ChemistryModule::WorkerDoWork(MPI_Status &probe_status, int double_co
int flags; int flags;
std::vector<double> mpi_buffer(count); std::vector<double> mpi_buffer(count);
static int ctrl_cells_processed = 0;
static std::vector<std::vector<double>> ctrl_batch;
const int CL_INDEX = 7; const int CL_INDEX = 7;
const double CL_THRESHOLD = 1e-10; const double CL_THRESHOLD = 1e-10;
/* receive */ /* receive */
MPI_Recv(mpi_buffer.data(), count, MPI_DOUBLE, 0, LOOP_WORK, this->group_comm, MPI_STATUS_IGNORE); MPI_Recv(mpi_buffer.data(), count, MPI_DOUBLE, 0, LOOP_WORK, this->group_comm,
MPI_STATUS_IGNORE);
/* decrement count of work_package by BUFFER_OFFSET */ /* decrement count of work_package by BUFFER_OFFSET */
count -= BUFFER_OFFSET; count -= BUFFER_OFFSET;
@ -205,19 +206,18 @@ void poet::ChemistryModule::WorkerDoWork(MPI_Status &probe_status, int double_co
wp_start_index = mpi_buffer[count + 4]; wp_start_index = mpi_buffer[count + 4];
for (std::size_t wp_i = 0; wp_i < s_curr_wp.size; wp_i++) { for (std::size_t wp_i = 0; wp_i < s_curr_wp.size; wp_i++) {
s_curr_wp.input[wp_i] = std::vector<double>(mpi_buffer.begin() + this->prop_count * wp_i, s_curr_wp.input[wp_i] =
mpi_buffer.begin() + this->prop_count * (wp_i + 1)); std::vector<double>(mpi_buffer.begin() + this->prop_count * wp_i,
mpi_buffer.begin() + this->prop_count * (wp_i + 1));
} }
/* skip simulation of cells cells where Cl concentration is below threshold */ /* skip simulation of cells cells where Cl concentration is below threshold */
/*
for (std::size_t wp_i = 0; wp_i < s_curr_wp.size; wp_i++) { for (std::size_t wp_i = 0; wp_i < s_curr_wp.size; wp_i++) {
if (s_curr_wp.input[wp_i][CL_INDEX] < CL_THRESHOLD) { if (s_curr_wp.input[wp_i][CL_INDEX] < CL_THRESHOLD) {
s_curr_wp.mapping[wp_i] = CHEM_SKIP; s_curr_wp.mapping[wp_i] = CHEM_SKIP;
s_curr_wp.output[wp_i] = s_curr_wp.input[wp_i]; s_curr_wp.output[wp_i] = s_curr_wp.input[wp_i];
} }
} }
*/
// std::cout << this->comm_rank << ":" << counter++ << std::endl; // std::cout << this->comm_rank << ":" << counter++ << std::endl;
if (dht_enabled || interp_enabled || stab_enabled) { if (dht_enabled || interp_enabled || stab_enabled) {
@ -245,26 +245,29 @@ void poet::ChemistryModule::WorkerDoWork(MPI_Status &probe_status, int double_co
} }
} }
// if (!this->stab_enabled) { /* process cells to be monitored in a seperate workpackage */ if (!this->stab_enabled) {
for (std::size_t wp_i = 0; wp_i < s_curr_wp.size; wp_i++) { std::unordered_map<uint32_t, std::vector<double>> ctrl_cells_wp;
uint32_t cell_id = s_curr_wp.input[wp_i][0]; for (std::size_t wp_i = 0; wp_i < s_curr_wp.size; wp_i++) {
uint32_t id = static_cast<uint32_t>(s_curr_wp.input[wp_i][0]);
bool is_ctrl_cell = this->ctrl_cell_ids.find(cell_id) != this->ctrl_cell_ids.end(); bool is_ctrl_cell = this->ctrl_cell_ids.find(id) != this->ctrl_cell_ids.end();
bool used_sur = (s_curr_wp.mapping[wp_i] != CHEM_PQC) && (s_curr_wp.mapping[wp_i] != CHEM_SKIP); bool used_sur =
(s_curr_wp.mapping[wp_i] != CHEM_PQC) && (s_curr_wp.mapping[wp_i] != CHEM_SKIP);
if (is_ctrl_cell && used_sur) { if (is_ctrl_cell && used_sur) {
ctrl_batch.push_back(s_curr_wp.input[wp_i]); ctrl_cells_wp[id] = s_curr_wp.input[wp_i];
ctrl_cells_processed++;
if (ctrl_batch.size() == s_curr_wp.size ||
ctrl_cells_processed == this->ctrl_cell_ids.size()) {
processCtrlPkgs(ctrl_batch, current_sim_time, dt, timings);
ctrl_batch.clear();
ctrl_cells_processed = 0;
} }
} }
if (!ctrl_cells_wp.empty()) {
std::vector<std::vector<double>> batch;
batch.reserve(ctrl_cells_wp.size());
for (auto &kv : ctrl_cells_wp) {
batch.push_back(kv.second);
}
processCtrlPkgs(batch, current_sim_time, dt, timings);
}
} }
// }
phreeqc_time_start = MPI_Wtime(); phreeqc_time_start = MPI_Wtime();
@ -276,7 +279,8 @@ void poet::ChemistryModule::WorkerDoWork(MPI_Status &probe_status, int double_co
/* send results to master */ /* send results to master */
MPI_Request send_req; MPI_Request send_req;
MPI_Isend(mpi_buffer.data(), count, MPI_DOUBLE, 0, LOOP_WORK, MPI_COMM_WORLD, &send_req); MPI_Isend(mpi_buffer.data(), count, MPI_DOUBLE, 0, LOOP_WORK, this->group_comm,
&send_req);
if (dht_enabled || interp_enabled || stab_enabled) { if (dht_enabled || interp_enabled || stab_enabled) {
/* write results to DHT */ /* write results to DHT */
@ -299,8 +303,8 @@ void poet::ChemistryModule::WorkerPostIter(MPI_Status &probe_status, uint32_t it
MPI_Get_count(&probe_status, MPI_INT, &size); MPI_Get_count(&probe_status, MPI_INT, &size);
if (size == 1) { if (size == 1) {
MPI_Recv(&flush_request, size, MPI_INT, probe_status.MPI_SOURCE, LOOP_END, this->group_comm, MPI_Recv(&flush_request, size, MPI_INT, probe_status.MPI_SOURCE, LOOP_END,
MPI_STATUS_IGNORE); this->group_comm, MPI_STATUS_IGNORE);
} else { } else {
MPI_Recv(NULL, 0, MPI_INT, probe_status.MPI_SOURCE, LOOP_END, this->group_comm, MPI_Recv(NULL, 0, MPI_INT, probe_status.MPI_SOURCE, LOOP_END, this->group_comm,
MPI_STATUS_IGNORE); MPI_STATUS_IGNORE);
@ -322,8 +326,8 @@ void poet::ChemistryModule::WorkerPostIter(MPI_Status &probe_status, uint32_t it
interp->resetCounter(); interp->resetCounter();
interp->writePHTStats(); interp->writePHTStats();
if (this->dht_snaps_type == DHT_SNAPS_ITEREND) { if (this->dht_snaps_type == DHT_SNAPS_ITEREND) {
out << this->dht_file_out_dir << "/iter_" << std::setfill('0') << std::setw(this->file_pad) out << this->dht_file_out_dir << "/iter_" << std::setfill('0')
<< iteration << ".pht"; << std::setw(this->file_pad) << iteration << ".pht";
interp->dumpPHTState(out.str()); interp->dumpPHTState(out.str());
} }
@ -347,16 +351,16 @@ void poet::ChemistryModule::WorkerPostSim(uint32_t iteration) {
} }
if (this->interp_enabled && this->dht_snaps_type >= DHT_SNAPS_ITEREND) { if (this->interp_enabled && this->dht_snaps_type >= DHT_SNAPS_ITEREND) {
std::stringstream out; std::stringstream out;
out << this->dht_file_out_dir << "/iter_" << std::setfill('0') << std::setw(this->file_pad) out << this->dht_file_out_dir << "/iter_" << std::setfill('0')
<< iteration << ".pht"; << std::setw(this->file_pad) << iteration << ".pht";
interp->dumpPHTState(out.str()); interp->dumpPHTState(out.str());
} }
} }
void poet::ChemistryModule::WorkerWriteDHTDump(uint32_t iteration) { void poet::ChemistryModule::WorkerWriteDHTDump(uint32_t iteration) {
std::stringstream out; std::stringstream out;
out << this->dht_file_out_dir << "/iter_" << std::setfill('0') << std::setw(this->file_pad) out << this->dht_file_out_dir << "/iter_" << std::setfill('0')
<< iteration << ".dht"; << std::setw(this->file_pad) << iteration << ".dht";
int res = dht->tableToFile(out.str().c_str()); int res = dht->tableToFile(out.str().c_str());
if (res != DHT_SUCCESS && this->comm_rank == 2) if (res != DHT_SUCCESS && this->comm_rank == 2)
std::cerr << "CPP: Worker: Error in writing current state of DHT to file.\n"; std::cerr << "CPP: Worker: Error in writing current state of DHT to file.\n";
@ -377,13 +381,13 @@ void poet::ChemistryModule::WorkerReadDHTDump(const std::string &dht_input_file)
} }
} else { } else {
if (this->comm_rank == 2) if (this->comm_rank == 2)
std::cout << "CPP: Worker: Successfully loaded state of DHT from file " << dht_input_file std::cout << "CPP: Worker: Successfully loaded state of DHT from file "
<< "\n"; << dht_input_file << "\n";
} }
} }
void poet::ChemistryModule::WorkerRunWorkPackage(WorkPackage &work_package, double dSimTime, void poet::ChemistryModule::WorkerRunWorkPackage(WorkPackage &work_package,
double dTimestep) { double dSimTime, double dTimestep) {
std::vector<std::vector<double>> inout_chem = work_package.input; std::vector<std::vector<double>> inout_chem = work_package.input;
std::vector<std::size_t> to_ignore; std::vector<std::size_t> to_ignore;
@ -399,7 +403,8 @@ void poet::ChemistryModule::WorkerRunWorkPackage(WorkPackage &work_package, doub
this->pqc_runner->run(inout_chem, dTimestep, to_ignore); this->pqc_runner->run(inout_chem, dTimestep, to_ignore);
//std::cout << "Ignored " << to_ignore.size() << " cells out of " << wp_size << "." << std::endl; // std::cout << "Ignored " << to_ignore.size() << " cells out of " << wp_size << "." <<
// std::endl;
for (std::size_t wp_id = 0; wp_id < work_package.size; wp_id++) { for (std::size_t wp_id = 0; wp_id < work_package.size; wp_id++) {
if (work_package.mapping[wp_id] == CHEM_PQC) { if (work_package.mapping[wp_id] == CHEM_PQC) {
@ -415,7 +420,8 @@ void poet::ChemistryModule::WorkerRunWorkPackage(WorkPackage &work_package, doub
void poet::ChemistryModule::WorkerPerfToMaster(int type, const struct worker_s &timings) { void poet::ChemistryModule::WorkerPerfToMaster(int type, const struct worker_s &timings) {
switch (type) { switch (type) {
case WORKER_PHREEQC: { case WORKER_PHREEQC: {
MPI_Gather(&timings.phreeqc_t, 1, MPI_DOUBLE, NULL, 1, MPI_DOUBLE, 0, this->group_comm); MPI_Gather(&timings.phreeqc_t, 1, MPI_DOUBLE, NULL, 1, MPI_DOUBLE, 0,
this->group_comm);
break; break;
} }
case WORKER_CTRL_ITER: { case WORKER_CTRL_ITER: {
@ -427,7 +433,8 @@ void poet::ChemistryModule::WorkerPerfToMaster(int type, const struct worker_s &
break; break;
} }
case WORKER_DHT_FILL: { case WORKER_DHT_FILL: {
MPI_Gather(&timings.dht_fill, 1, MPI_DOUBLE, NULL, 1, MPI_DOUBLE, 0, this->group_comm); MPI_Gather(&timings.dht_fill, 1, MPI_DOUBLE, NULL, 1, MPI_DOUBLE, 0,
this->group_comm);
break; break;
} }
case WORKER_IDLE: { case WORKER_IDLE: {
@ -470,8 +477,8 @@ void poet::ChemistryModule::WorkerMetricsToMaster(int type) {
auto reduce_and_send = [&worker_rank, &worker_comm, auto reduce_and_send = [&worker_rank, &worker_comm,
&group_comm](std::vector<std::uint32_t> &send_buffer, int tag) { &group_comm](std::vector<std::uint32_t> &send_buffer, int tag) {
std::vector<uint32_t> to_master(send_buffer.size()); std::vector<uint32_t> to_master(send_buffer.size());
MPI_Reduce(send_buffer.data(), to_master.data(), send_buffer.size(), MPI_UINT32_T, MPI_SUM, 0, MPI_Reduce(send_buffer.data(), to_master.data(), send_buffer.size(), MPI_UINT32_T,
worker_comm); MPI_SUM, 0, worker_comm);
if (worker_rank == 0) { if (worker_rank == 0) {
MPI_Send(to_master.data(), to_master.size(), MPI_UINT32_T, 0, tag, group_comm); MPI_Send(to_master.data(), to_master.size(), MPI_UINT32_T, 0, tag, group_comm);

View File

@ -5,26 +5,22 @@
#include <cmath> #include <cmath>
#include <numeric> #include <numeric>
poet::ControlModule::ControlModule(const ControlConfig &config_) : config(config_) {} poet::ControlModule::ControlModule(const ControlConfig &config_, ChemistryModule &chem)
: config(config_), chem_(chem) {}
void poet::ControlModule::beginIteration(ChemistryModule &chem, uint32_t &iter,
const bool &dht_enabled, const bool &interp_enabled) {
void poet::ControlModule::beginIteration(uint32_t &iter, const bool &dht_enabled,
const bool &interp_enabled) {
global_iter = iter; global_iter = iter;
double prep_a, prep_b; double prep_a, prep_b;
prep_a = MPI_Wtime(); prep_a = MPI_Wtime();
updateSurrState(dht_enabled, interp_enabled);
updateSurrState(chem, dht_enabled, interp_enabled);
prep_b = MPI_Wtime(); prep_b = MPI_Wtime();
this->prep_t += prep_b - prep_a; this->prep_t += prep_b - prep_a;
} }
/* Disables dht and/or interp during stabilzation phase */ void poet::ControlModule::updateSurrState(bool dht_enabled, bool interp_enabled) {
void poet::ControlModule::updateSurrState(ChemistryModule &chem, bool dht_enabled,
bool interp_enabled) {
bool in_warmup = (global_iter <= config.stab_interval); bool in_warmup = (global_iter <= config.stab_interval);
bool rb_limit_reached = (rb_count >= config.rb_limit); bool rb_limit_reached = rbLimitReached();
if (rb_enabled && stab_countdown > 0) { if (rb_enabled && stab_countdown > 0) {
--stab_countdown; --stab_countdown;
@ -32,48 +28,56 @@ void poet::ControlModule::updateSurrState(ChemistryModule &chem, bool dht_enable
if (stab_countdown == 0) { if (stab_countdown == 0) {
rb_enabled = false; rb_enabled = false;
} }
flush_request = false;
} }
/* disables surrogate during warmup, active rollback or after rollback limit is reached
/* disable surrogates during warmup, active rollback or after limit */ */
if (in_warmup || rb_enabled || rb_limit_reached) { if (in_warmup || rb_enabled || rb_limit_reached) {
chem.SetStabEnabled(!rb_limit_reached); chem_.SetStabEnabled(!rb_limit_reached);
chem.SetDhtEnabled(false); chem_.SetDhtEnabled(false);
chem.SetInterpEnabled(false); chem_.SetInterpEnabled(false);
if (rb_limit_reached) { if (rb_limit_reached) {
std::cout << "Interpolation completly disabled." << std::endl; std::cout << "Interpolation completely disabled." << std::endl;
} else { } else {
std::cout << "In stabilization phase." << std::endl; std::cout << "In stabilization phase." << std::endl;
} }
return; return;
} }
if (rb_count > 0 && !rb_enabled && !in_warmup) {
surr_active++;
if (surr_active > config.rb_interval_limit) {
surr_active = 0;
rb_count -= 1;
std::cout << "Rollback count reset to: " << rb_count << "." << std::endl;
}
}
/* enable user-requested surrogates */ /* enable user-requested surrogates */
chem.SetStabEnabled(false); chem_.SetStabEnabled(false);
chem.SetDhtEnabled(dht_enabled); chem_.SetDhtEnabled(dht_enabled);
chem.SetInterpEnabled(interp_enabled); chem_.SetInterpEnabled(interp_enabled);
std::cout << "Interpolating." << std::endl; std::cout << "Interpolating." << std::endl;
} }
void poet::ControlModule::writeCheckpoint(DiffusionModule &diffusion, uint32_t &iter, void poet::ControlModule::writeCheckpoint(uint32_t &iter, const std::string &out_dir) {
const std::string &out_dir) {
if (global_iter % config.chkpt_interval == 0) { if (global_iter % config.chkpt_interval == 0) {
double w_check_a = MPI_Wtime(); double w_check_a = MPI_Wtime();
write_checkpoint(out_dir, "checkpoint" + std::to_string(iter) + ".hdf5", write_checkpoint(out_dir, "checkpoint" + std::to_string(iter) + ".hdf5",
{.field = diffusion.getField(), .iteration = iter}); {.field = chem_.getField(), .iteration = iter});
double w_check_b = MPI_Wtime(); double w_check_b = MPI_Wtime();
this->w_check_t += w_check_b - w_check_a; this->w_check_t += w_check_b - w_check_a;
} }
} }
void poet::ControlModule::readCheckpoint(DiffusionModule &diffusion, uint32_t &current_iter, void poet::ControlModule::readCheckpoint(uint32_t &current_iter, uint32_t rollback_iter,
uint32_t rollback_iter, const std::string &out_dir) { const std::string &out_dir) {
double r_check_a, r_check_b; double r_check_a, r_check_b;
r_check_a = MPI_Wtime(); r_check_a = MPI_Wtime();
Checkpoint_s checkpoint_read{.field = diffusion.getField()}; Checkpoint_s checkpoint_read{.field = chem_.getField()};
read_checkpoint(out_dir, "checkpoint" + std::to_string(rollback_iter) + ".hdf5", checkpoint_read); read_checkpoint(out_dir, "checkpoint" + std::to_string(rollback_iter) + ".hdf5",
checkpoint_read);
current_iter = checkpoint_read.iteration; current_iter = checkpoint_read.iteration;
r_check_b = MPI_Wtime(); r_check_b = MPI_Wtime();
r_check_t += r_check_b - r_check_a; r_check_t += r_check_b - r_check_a;
@ -81,168 +85,91 @@ void poet::ControlModule::readCheckpoint(DiffusionModule &diffusion, uint32_t &c
void poet::ControlModule::writeMetrics(uint32_t &iter, const std::string &out_dir, void poet::ControlModule::writeMetrics(uint32_t &iter, const std::string &out_dir,
const std::vector<std::string> &species) { const std::vector<std::string> &species) {
if (rb_count >= config.rb_limit || global_iter <= config.stab_interval) {
return;
}
if (rb_enabled) {
return;
}
double stats_a = MPI_Wtime(); double stats_a = MPI_Wtime();
writeSpeciesStatsToCSV(s_history, species, out_dir, "species_overview.csv"); writeSpeciesStatsToCSV(s_history, species, out_dir, "species_overview.csv");
write_metrics(c_history, species, out_dir, "metrics_overview.hdf5"); if (global_iter % (config.chkpt_interval / 2) == 0) {
write_metrics(c_history, species, out_dir, "metrics_overview.hdf5");
}
double stats_b = MPI_Wtime(); double stats_b = MPI_Wtime();
this->stats_t += stats_b - stats_a; this->stats_t += stats_b - stats_a;
} }
uint32_t poet::ControlModule::calcRbIter() { uint32_t poet::ControlModule::calcRbIter() {
uint32_t last_iter = ((global_iter - 1) / config.chkpt_interval) * config.chkpt_interval; uint32_t last_iter =
((global_iter - 1) / config.chkpt_interval) * config.chkpt_interval;
return last_iter; return last_iter;
} }
std::optional<uint32_t> poet::ControlModule::findRbTarget(const std::vector<std::string> &species) { std::optional<uint32_t>
poet::ControlModule::findRbTarget(const std::vector<std::string> &species) {
double r_check_a, r_check_b;
/* Skip threshold checking if already in stabilization phase*/ /* Skip threshold checking if already in stabilization phase*/
if (s_history.empty() || rb_enabled) { if (s_history.empty() || rb_enabled) {
return std::nullopt; return std::nullopt;
} }
const auto &s_hist = s_history.back(); const auto &s_hist = s_history.back();
/* skipping cell_id and id */ /* skipping cell_id and id */
for (size_t sp_i = 2; sp_i < species.size(); sp_i++) { for (size_t sp_idx = 2; sp_idx < species.size(); sp_idx++) {
/* skip Charge */
/* check bounds of threshold vector*/ if (sp_idx == 4) {
if (sp_i >= config.mape_threshold.size()) {
std::cerr << "Warning: No threshold defined for species " << species[sp_i] << " at index "
<< std::to_string(sp_i) << std::endl;
continue; continue;
} }
if (s_hist.mape[sp_i] > config.mape_threshold[sp_i]) { if (sp_idx >= config.mape_threshold.size()) {
std::cerr << "Warning: No threshold defined for species " << species[sp_idx]
const auto &c_hist = c_history.back(); << " at index " << std::to_string(sp_idx) << std::endl;
auto max_it = continue;
std::max_element(c_hist.mape.begin(), c_hist.mape.end(), }
[sp_i](const auto &a, const auto &b) { return a[sp_i] < b[sp_i]; }); const double sp_mape = s_hist.mape[sp_idx];
const double sp_thr = config.mape_threshold[sp_idx];
size_t max_idx = std::distance(c_hist.mape.begin(), max_it);
uint32_t cell_id = c_hist.id[max_idx];
double cell_mape = (*max_it)[sp_i];
if (sp_mape > sp_thr) {
flush_request = true; flush_request = true;
std::cout << "Species " << species[sp_idx]
<< " exceeded threshold (MAPE=" << sp_mape << " > thr=" << sp_thr
<< "). Offending cells:" << std::endl;
std::cout << "Threshold exceeded for " << species[sp_i] if (!c_history.empty()) {
<< " with species-level MAPE = " << std::to_string(s_hist.mape[sp_i]) const auto &c_hist = c_history.back();
<< " exceeding threshold = " << std::to_string(config.mape_threshold[sp_i]) printExceedingCells(c_hist, sp_idx, sp_thr);
<< ". Worst cell: ID=" << std::to_string(cell_id) }
<< " with MAPE=" << std::to_string(cell_mape) << std::endl;
return calcRbIter(); return calcRbIter();
} }
} }
return std::nullopt; return std::nullopt;
} }
void poet::ControlModule::computeMetrics(std::vector<std::vector<double>> &reference_values, void poet::ControlModule::printExceedingCells(const CellMetrics &c_hist, size_t sp_idx,
std::vector<std::vector<double>> &surrogate_values, double sp_thr) {
const std::vector<std::string> &species, for (size_t cell_idx = 0; cell_idx < c_hist.mape.size(); ++cell_idx) {
const uint32_t size_per_prop, const std::string &out_dir) { const double mape = c_hist.mape[cell_idx][sp_idx];
if (rb_count >= config.rb_limit) { if (mape > sp_thr) {
return; const uint32_t id = c_hist.id[cell_idx];
std::cout << " cell_id=" << id << " mape=" << mape << std::endl;
}
} }
}
const uint32_t n_cells = reference_values.size(); void poet::ControlModule::computeMetrics(std::vector<std::vector<double>> &ref_values,
std::vector<std::vector<double>> &sur_values,
const std::vector<std::string> &species,
const uint32_t size_per_prop,
const std::string &out_dir) {
std::cout << "DEBUG: computeMetrics called at iteration " << global_iter
<< ", rb_count=" << rb_count << "/" << config.rb_limit << std::endl;
const uint32_t n_cells = ref_values.size();
const uint32_t n_species = species.size(); const uint32_t n_species = species.size();
const double ZERO_ABS = config.zero_abs;
CellMetrics c_metrics(n_cells, n_species, global_iter, rb_count); CellMetrics c_metrics(n_cells, n_species, global_iter, rb_count);
SpeciesMetrics s_metrics(n_species, global_iter, rb_count); SpeciesMetrics s_metrics(n_species, global_iter, rb_count);
std::vector<double> species_err_sum(n_species, 0.0); /* compute cell-level metrics */
std::vector<double> species_sqr_sum(n_species, 0.0); computeCellMetrics(ref_values, sur_values, c_metrics);
for (size_t cell_i = 0; cell_i < n_cells; cell_i++) { /* compute species-level metrics */
computeSpeciesMetrics(ref_values, sur_values, species, size_per_prop, s_metrics);
c_metrics.id.push_back(reference_values[cell_i][0]);
c_metrics.mape[cell_i][0] = reference_values[cell_i][0];
c_metrics.rrmse[cell_i][0] = reference_values[cell_i][0];
for (size_t sp_i = 2; sp_i < n_species; sp_i++) {
const double ref_value = reference_values[cell_i][sp_i];
const double sur_value = surrogate_values[cell_i][sp_i];
if (std::isnan(ref_value) || std::isnan(sur_value)) {
// Initialize to 0 for NaN cases to avoid uninitialized values
c_metrics.mape[cell_i][sp_i] = 0.0;
c_metrics.rrmse[cell_i][sp_i] = 0.0;
std::cout << "WARNING: NaN detected - Cell=" << reference_values[cell_i][0]
<< ", Species=" << species[sp_i]
<< ", Ref=" << (std::isnan(ref_value) ? "NaN" : std::to_string(ref_value))
<< ", Sur=" << (std::isnan(sur_value) ? "NaN" : std::to_string(sur_value))
<< std::endl;
continue;
}
if (std::abs(ref_value) < ZERO_ABS) {
if (std::abs(sur_value) >= ZERO_ABS) {
species_err_sum[sp_i] += 1.0;
species_sqr_sum[sp_i] += 1.0;
c_metrics.mape[cell_i][sp_i] = 100.0;
c_metrics.rrmse[cell_i][sp_i] = 1.0;
} else {
// Both values are near zero, initialize to 0
c_metrics.mape[cell_i][sp_i] = 0.0;
c_metrics.rrmse[cell_i][sp_i] = 0.0;
}
} else {
double alpha = 1.0 - (sur_value / ref_value);
species_err_sum[sp_i] += std::abs(alpha);
species_sqr_sum[sp_i] += alpha * alpha;
c_metrics.mape[cell_i][sp_i] = 100.0 * std::abs(alpha);
c_metrics.rrmse[cell_i][sp_i] = alpha * alpha;
// Log extreme MAPE values for debugging
if (c_metrics.mape[cell_i][sp_i] > 100.0) {
std::cout << "WARNING: High MAPE detected - Cell=" << c_metrics.id[cell_i]
<< ", Species=" << species[sp_i] << ", MAPE=" << c_metrics.mape[cell_i][sp_i]
<< "%, Ref=" << ref_value << ", Sur=" << sur_value << ", Alpha=" << alpha
<< std::endl;
}
}
}
}
for (size_t sp_i = 2; sp_i < n_species; sp_i++) {
s_metrics.mape[sp_i] = 100.0 * (species_err_sum[sp_i] / size_per_prop);
s_metrics.rrmse[sp_i] = std::sqrt(species_sqr_sum[sp_i] / size_per_prop);
}
// Sort cell metrics by ID
std::vector<size_t> indices(n_cells);
std::iota(indices.begin(), indices.end(), 0);
std::sort(indices.begin(), indices.end(),
[&c_metrics](size_t a, size_t b) { return c_metrics.id[a] < c_metrics.id[b]; });
// Reorder cell metrics based on sorted indices
std::vector<uint32_t> sorted_ids(n_cells);
std::vector<std::vector<double>> sorted_mape(n_cells, std::vector<double>(n_species));
std::vector<std::vector<double>> sorted_rrmse(n_cells, std::vector<double>(n_species));
for (size_t i = 0; i < n_cells; i++) {
sorted_ids[i] = c_metrics.id[indices[i]];
sorted_mape[i] = c_metrics.mape[indices[i]];
sorted_rrmse[i] = c_metrics.rrmse[indices[i]];
}
c_metrics.id = std::move(sorted_ids);
c_metrics.mape = std::move(sorted_mape);
c_metrics.rrmse = std::move(sorted_rrmse);
s_history.push_back(s_metrics); s_history.push_back(s_metrics);
c_history.push_back(c_metrics); c_history.push_back(c_metrics);
@ -250,30 +177,105 @@ void poet::ControlModule::computeMetrics(std::vector<std::vector<double>> &refer
writeMetrics(global_iter, out_dir, species); writeMetrics(global_iter, out_dir, species);
} }
void poet::ControlModule::processCheckpoint(DiffusionModule &diffusion, uint32_t &current_iter, inline double poet::ControlModule::computeAlpha(double ref, double sur) const {
const std::string &out_dir, const double zero_abs = config.zero_abs;
const std::vector<std::string> &species) { if (std::isnan(ref) || std::isnan(sur)) {
if (rb_count >= config.rb_limit) { return 0.0;
}
if (std::abs(ref) < zero_abs) {
return (std::abs(sur) < zero_abs) ? 0.0 : 1.0;
}
return 1.0 - (sur / ref);
}
void poet::ControlModule::computeSpeciesMetrics(
const std::vector<std::vector<double>> &ref_values,
const std::vector<std::vector<double>> &sur_values,
const std::vector<std::string> &species, uint32_t size_per_prop,
SpeciesMetrics &s_metrics) {
const size_t n_cells = ref_values.size();
const size_t n_species = species.size();
std::vector<double> err_sum(n_species, 0.0);
std::vector<double> sqr_sum(n_species, 0.0);
for (size_t cell_idx = 0; cell_idx < n_cells; cell_idx++) {
for (size_t sp_idx = 2; sp_idx < n_species; sp_idx++) {
const double alpha =
computeAlpha(ref_values[cell_idx][sp_idx], sur_values[cell_idx][sp_idx]);
err_sum[sp_idx] += std::abs(alpha);
sqr_sum[sp_idx] += alpha * alpha;
}
}
for (size_t sp_idx = 2; sp_idx < n_species; sp_idx++) {
s_metrics.mape[sp_idx] =
100.0 * (err_sum[sp_idx] / static_cast<double>(size_per_prop));
s_metrics.rrmse[sp_idx] =
std::sqrt(sqr_sum[sp_idx] / static_cast<double>(size_per_prop));
}
}
void poet::ControlModule::computeCellMetrics(
const std::vector<std::vector<double>> &ref_values,
const std::vector<std::vector<double>> &sur_values, CellMetrics &c_metrics) {
const size_t n_cells = ref_values.size();
// Use ref_values to get n_species instead of accessing potentially uninitialized mape
const size_t n_species = (n_cells > 0) ? ref_values[0].size() : 0;
if (n_cells == 0 || n_species == 0) {
std::cerr << "Warning: computeCellMetrics called with empty data" << std::endl;
return; return;
} }
if (flush_request && rb_count < config.rb_limit) {
for (size_t cell_idx = 0; cell_idx < n_cells; ++cell_idx) {
// Assign the per-cell id correctly
c_metrics.id[cell_idx] = static_cast<uint32_t>(ref_values[cell_idx][0]);
for (size_t sp_idx = 2; sp_idx < n_species; ++sp_idx) {
const double alpha =
computeAlpha(ref_values[cell_idx][sp_idx], sur_values[cell_idx][sp_idx]);
c_metrics.mape[cell_idx][sp_idx] = 100.0 * std::abs(alpha);
c_metrics.rrmse[cell_idx][sp_idx] = alpha * alpha;
c_metrics.conc[cell_idx][sp_idx] = ref_values[cell_idx][sp_idx];
}
}
/* cell_ID and ID columns */
}
void poet::ControlModule::processCheckpoint(uint32_t &current_iter,
const std::string &out_dir,
const std::vector<std::string> &species) {
if (rbLimitReached()) {
return;
}
if (flush_request) {
uint32_t target = calcRbIter(); uint32_t target = calcRbIter();
readCheckpoint(diffusion, current_iter, target, out_dir); readCheckpoint(current_iter, target, out_dir);
rb_enabled = true; rb_enabled = true;
rb_count++; rb_count++;
stab_countdown = config.stab_interval; stab_countdown = config.stab_interval;
flush_request = false;
std::cout << "Restored checkpoint " << std::to_string(target) << ", surrogate disabled for " std::cout << "Restored checkpoint " << std::to_string(target)
<< std::to_string(config.stab_interval) << std::endl; << ", surrogate disabled for " << std::to_string(config.stab_interval)
<< std::endl;
} else { } else {
writeCheckpoint(diffusion, global_iter, out_dir); writeCheckpoint(global_iter, out_dir);
} }
} }
bool poet::ControlModule::needsFlagBcast() const { bool poet::ControlModule::needsFlagBcast() const {
if (rb_count >= config.rb_limit) { return (config.rb_limit > 0) && !rbLimitReached();
return false;
}
return true;
} }
inline bool poet::ControlModule::rbLimitReached() const {
/* rollback is completly disabled */
if (config.rb_limit == 0)
return false;
return rb_count >= config.rb_limit;
}

View File

@ -4,7 +4,6 @@
#include "Base/Macros.hpp" #include "Base/Macros.hpp"
#include "Chemistry/ChemistryModule.hpp" #include "Chemistry/ChemistryModule.hpp"
#include "IO/HDF5Functions.hpp" #include "IO/HDF5Functions.hpp"
#include "Transport/DiffusionModule.hpp"
#include "poet.hpp" #include "poet.hpp"
#include <cstdint> #include <cstdint>
@ -21,6 +20,7 @@ struct ControlConfig {
uint32_t stab_interval = 0; uint32_t stab_interval = 0;
uint32_t chkpt_interval = 0; uint32_t chkpt_interval = 0;
uint32_t rb_limit = 0; uint32_t rb_limit = 0;
uint32_t rb_interval_limit = 0;
double zero_abs = 0.0; double zero_abs = 0.0;
std::vector<double> mape_threshold; std::vector<double> mape_threshold;
}; };
@ -29,12 +29,15 @@ struct CellMetrics {
std::vector<std::uint32_t> id; std::vector<std::uint32_t> id;
std::vector<std::vector<double>> mape; std::vector<std::vector<double>> mape;
std::vector<std::vector<double>> rrmse; std::vector<std::vector<double>> rrmse;
std::vector<std::vector<double>> conc;
uint32_t iteration = 0; uint32_t iteration = 0;
uint32_t rb_count = 0; uint32_t rb_count = 0;
CellMetrics(uint32_t n_cells, uint32_t n_species, uint32_t iter, uint32_t rb_count) CellMetrics(uint32_t n_cells, uint32_t n_species, uint32_t iter, uint32_t rb_count)
: mape(n_cells, std::vector<double>(n_species, 0.0)), : id(n_cells, 0), mape(n_cells, std::vector<double>(n_species, 0.0)),
rrmse(n_cells, std::vector<double>(n_species, 0.0)), iteration(iter), rb_count(rb_count) {} rrmse(n_cells, std::vector<double>(n_species, 0.0)),
conc(n_cells, std::vector<double>(n_species, 0.0)), iteration(iter),
rb_count(rb_count) {}
}; };
struct SpeciesMetrics { struct SpeciesMetrics {
@ -50,22 +53,24 @@ struct SpeciesMetrics {
class ControlModule { class ControlModule {
public: public:
explicit ControlModule(const ControlConfig &config); explicit ControlModule(const ControlConfig &config, ChemistryModule &chem);
void beginIteration(ChemistryModule &chem, uint32_t &iter, const bool &dht_enabled, /* store global iteration, dht and pht settings */
const bool &interp_enaled); void beginIteration(uint32_t &iter, const bool &dht_enabled, const bool &interp_enaled);
void writeMetrics(uint32_t &iter, const std::string &out_dir, void writeMetrics(uint32_t &iter, const std::string &out_dir,
const std::vector<std::string> &species); const std::vector<std::string> &species);
void computeMetrics(std::vector<std::vector<double>> &reference_values, void computeMetrics(std::vector<std::vector<double>> &reference_values,
std::vector<std::vector<double>> &surrogate_values, std::vector<std::vector<double>> &surrogate_values,
const std::vector<std::string> &species, const uint32_t size_per_prop, const std::vector<std::string> &species,
const std::string &out_dir); const uint32_t size_per_prop, const std::string &out_dir);
void processCheckpoint(DiffusionModule &diffusion, uint32_t &current_iter, void processCheckpoint(uint32_t &current_iter, const std::string &out_dir,
const std::string &out_dir, const std::vector<std::string> &species); const std::vector<std::string> &species);
/* Returnn rollback target or nullopt and sets flush_request when threshold is exceeded.
It reports the cells which exceed the threshold. */
std::optional<uint32_t> findRbTarget(const std::vector<std::string> &species); std::optional<uint32_t> findRbTarget(const std::vector<std::string> &species);
bool getFlushRequest() const { return flush_request; } bool getFlushRequest() const { return flush_request; }
@ -88,20 +93,38 @@ public:
auto getMetricsWriteTime() const { return stats_t; } auto getMetricsWriteTime() const { return stats_t; }
private: private:
void updateSurrState(ChemistryModule &chem, bool dht_enabled, bool interp_enabled); void updateSurrState(bool dht_enabled, bool interp_enabled);
void readCheckpoint(DiffusionModule &diffusion, uint32_t &current_iter, uint32_t rollback_iter, void readCheckpoint(uint32_t &current_iter, uint32_t rollback_iter,
const std::string &out_dir); const std::string &out_dir);
void writeCheckpoint(DiffusionModule &diffusion, uint32_t &iter, const std::string &out_dir); void writeCheckpoint(uint32_t &iter, const std::string &out_dir);
uint32_t calcRbIter(); uint32_t calcRbIter();
void computeSpeciesMetrics(const std::vector<std::vector<double>> &ref_values,
const std::vector<std::vector<double>> &sur_values,
const std::vector<std::string> &species,
uint32_t size_per_prop, SpeciesMetrics &s_metrics);
void computeCellMetrics(const std::vector<std::vector<double>> &ref_values,
const std::vector<std::vector<double>> &sur_values,
CellMetrics &c_metrics);
inline double computeAlpha(double ref, double sur) const;
static void printExceedingCells(const CellMetrics &c_hist, size_t sp_idx,
double sp_thr);
inline bool rbLimitReached() const;
ControlConfig config; ControlConfig config;
ChemistryModule &chem_;
std::uint32_t global_iter = 0; std::uint32_t global_iter = 0;
std::uint32_t rb_count = 0; std::uint32_t rb_count = 0;
std::uint32_t rb_limit = 0; std::uint32_t rb_limit = 0;
std::uint32_t stab_countdown = 0; std::uint32_t stab_countdown = 0;
std::uint32_t surr_active = 0;
std::vector<uint32_t> ctrl_cell_ids; std::vector<uint32_t> ctrl_cell_ids;
bool rb_enabled = false; bool rb_enabled = false;

View File

@ -8,7 +8,9 @@
#include <highfive/highfive.hpp> #include <highfive/highfive.hpp>
#include <iomanip> #include <iomanip>
#include <iostream> #include <iostream>
#include <limits>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
namespace fs = std::filesystem; namespace fs = std::filesystem;
@ -16,65 +18,98 @@ namespace fs = std::filesystem;
int write_metrics(const std::vector<poet::CellMetrics> &metrics_history, int write_metrics(const std::vector<poet::CellMetrics> &metrics_history,
const std::vector<std::string> &species_names, const std::vector<std::string> &species_names,
const std::string &dir_path, const std::string &file_name) { const std::string &dir_path, const std::string &file_name) {
if (!fs::exists(dir_path)) { if (!fs::exists(dir_path)) {
std::cerr << "Directory does not exist: " << dir_path << std::endl; std::cerr << "Directory does not exist: " << dir_path << std::endl;
return -1; return -1;
} }
fs::path file_path = fs::path(dir_path) / file_name; fs::path file_path = fs::path(dir_path) / file_name;
// Use a std::string path to avoid filesystem path conversion issues. const std::size_t n_species = species_names.size();
H5Easy::File file(file_path.string(), H5Easy::File::Truncate); if (n_species == 0) {
std::cerr << "Species names list is empty; skipping HDF5 metrics dump.\n";
return -1;
}
for (size_t idx = 0; idx < metrics_history.size(); ++idx) { // Always start from a clean file: no stale datasets, no mismatched shapes.
const auto &metrics = metrics_history[idx]; HighFive::File file(file_path.string(), HighFive::File::Truncate);
/*
std::string grp = "entry_" + std::to_string(idx) + "_iter_" +
std::to_string(metrics.iteration) + "_rb_" +
std::to_string(metrics.rollback_count);
*/
std::string grp = "iter_" + std::to_string(metrics.iteration) + "_rb_" +
std::to_string(metrics.rb_count);
size_t n_cells = metrics.id.size(); // cell_id -> all rows over time
// Use provided species_names as the source of truth to avoid OOB when mape is empty. using Row = std::vector<double>;
size_t n_species = species_names.size(); using Matrix = std::vector<Row>;
std::unordered_map<std::uint32_t, Matrix> mape_per_cell;
std::unordered_map<std::uint32_t, Matrix> rrmse_per_cell;
std::unordered_map<std::uint32_t, Matrix> conc_per_cell;
// Create a scalar dataset "meta" and attach attributes to it (no explicit groups). // Build per-cell time series from the *entire* history.
H5Easy::dump(file, grp + "/meta", 0, H5Easy::DumpMode::Overwrite); for (const auto &metrics : metrics_history) {
const std::uint32_t iter = metrics.iteration;
H5Easy::dumpAttribute(file, grp + "/meta", "species_names", species_names, for (std::size_t cell_idx = 0; cell_idx < metrics.id.size(); ++cell_idx) {
H5Easy::DumpMode::Overwrite); const auto cell_id = metrics.id[cell_idx];
H5Easy::dumpAttribute(file, grp + "/meta", "iteration", metrics.iteration,
H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "rollback_count",
metrics.rb_count, H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "n_cells", n_cells,
H5Easy::DumpMode::Overwrite);
H5Easy::dumpAttribute(file, grp + "/meta", "n_species", n_species,
H5Easy::DumpMode::Overwrite);
// Dump only if data is non-empty to avoid corrupting the file on failures. // --- MAPE row: [iteration, mape...]
if (!metrics.mape.empty()) { Row mape_row;
H5Easy::dump(file, grp + "/mape", metrics.mape, mape_row.reserve(1 + n_species);
H5Easy::DumpMode::Overwrite); mape_row.push_back(static_cast<double>(iter));
}
if (!metrics.rrmse.empty()) { const auto &src_m = metrics.mape[cell_idx];
H5Easy::dump(file, grp + "/rrmse", metrics.rrmse, const std::size_t len_m = std::min(src_m.size(), n_species);
H5Easy::DumpMode::Overwrite); mape_row.insert(mape_row.end(), src_m.begin(), src_m.begin() + len_m);
if (len_m < n_species) {
mape_row.resize(1 + n_species, std::numeric_limits<double>::quiet_NaN());
}
// --- RRMSE row: [iteration, rrmse...]
Row rrmse_row;
rrmse_row.reserve(1 + n_species);
rrmse_row.push_back(static_cast<double>(iter));
const auto &src_r = metrics.rrmse[cell_idx];
const std::size_t len_r = std::min(src_r.size(), n_species);
rrmse_row.insert(rrmse_row.end(), src_r.begin(), src_r.begin() + len_r);
if (len_r < n_species) {
rrmse_row.resize(1 + n_species, std::numeric_limits<double>::quiet_NaN());
}
Row conc_row;
conc_row.reserve(1 + n_species);
conc_row.push_back(static_cast<double>(iter));
const auto &src_c = metrics.conc[cell_idx];
const std::size_t len_c = std::min(src_c.size(), n_species);
conc_row.insert(conc_row.end(), src_c.begin(), src_c.begin() + len_c);
if (len_c < n_species) {
conc_row.resize(1 + n_species, std::numeric_limits<double>::quiet_NaN());
}
// Append to per-cell matrices
mape_per_cell[cell_id].push_back(std::move(mape_row));
rrmse_per_cell[cell_id].push_back(std::move(rrmse_row));
conc_per_cell[cell_id].push_back(std::move(conc_row));
} }
} }
// Ensure all buffers are flushed to disk. // Now dump each cells full time series
file.flush(); for (const auto &kv : mape_per_cell) {
const auto cell_id = kv.first;
const auto &mape_matrix = kv.second;
const auto &rrmse_matrix = rrmse_per_cell[cell_id];
const auto &conc_matrix = conc_per_cell[cell_id];
const std::string cell_grp = "cells/" + std::to_string(cell_id);
H5Easy::dump(file, cell_grp + "/mape", mape_matrix, H5Easy::DumpMode::Overwrite);
H5Easy::dump(file, cell_grp + "/rrmse", rrmse_matrix, H5Easy::DumpMode::Overwrite);
H5Easy::dump(file, cell_grp + "/conc", conc_matrix, H5Easy::DumpMode::Overwrite);
}
return 0; return 0;
} }
void writeCellStatsToCSV(const std::vector<poet::CellMetrics> &all_stats, void writeCellStatsToCSV(const std::vector<poet::CellMetrics> &all_stats,
const std::vector<std::string> &species_names, const std::vector<std::string> &species_names,
const std::string &out_dir, const std::string &out_dir, const std::string &filename) {
const std::string &filename) {
std::ofstream out(std::filesystem::path(out_dir) / filename); std::ofstream out(std::filesystem::path(out_dir) / filename);
if (!out.is_open()) { if (!out.is_open()) {
std::cerr << "Could not open " << filename << " !" << std::endl; std::cerr << "Could not open " << filename << " !" << std::endl;
@ -83,8 +118,8 @@ void writeCellStatsToCSV(const std::vector<poet::CellMetrics> &all_stats,
// Header // Header
out << std::left << std::setw(15) << "CellID" << std::setw(15) << "Iteration" out << std::left << std::setw(15) << "CellID" << std::setw(15) << "Iteration"
<< std::setw(15) << "Rollback" << std::setw(15) << "Species" << std::setw(15) << "Rollback" << std::setw(15) << "Species" << std::setw(15)
<< std::setw(15) << "MAPE" << std::setw(15) << "RRMSE" << "MAPE" << std::setw(15) << "RRMSE"
<< "\n" << "\n"
<< std::string(90, '-') << "\n"; << std::string(90, '-') << "\n";
@ -92,25 +127,21 @@ void writeCellStatsToCSV(const std::vector<poet::CellMetrics> &all_stats,
for (const auto &metrics : all_stats) { for (const auto &metrics : all_stats) {
for (size_t cell_idx = 0; cell_idx < metrics.id.size(); ++cell_idx) { for (size_t cell_idx = 0; cell_idx < metrics.id.size(); ++cell_idx) {
for (size_t sp_idx = 0; sp_idx < species_names.size(); ++sp_idx) { for (size_t sp_idx = 0; sp_idx < species_names.size(); ++sp_idx) {
out << std::left << std::setw(15) << metrics.id[cell_idx] out << std::left << std::setw(15) << metrics.id[cell_idx] << std::setw(15)
<< std::setw(15) << metrics.iteration << metrics.iteration << std::setw(15) << metrics.rb_count << std::setw(15)
<< std::setw(15) << metrics.rb_count << species_names[sp_idx] << std::setw(15) << metrics.mape[cell_idx][sp_idx]
<< std::setw(15) << species_names[sp_idx]
<< std::setw(15) << metrics.mape[cell_idx][sp_idx]
<< std::setw(15) << metrics.rrmse[cell_idx][sp_idx] << "\n"; << std::setw(15) << metrics.rrmse[cell_idx][sp_idx] << "\n";
} }
} }
out << "\n"; out << "\n";
} }
out.close(); out.close();
std::cout << "Cell error metrics written to " << out_dir << "/" << filename std::cout << "Cell error metrics written to " << out_dir << "/" << filename << "\n";
<< "\n";
} }
void writeSpeciesStatsToCSV( void writeSpeciesStatsToCSV(const std::vector<poet::SpeciesMetrics> &all_stats,
const std::vector<poet::SpeciesMetrics> &all_stats, const std::vector<std::string> &species_names,
const std::vector<std::string> &species_names, const std::string &out_dir, const std::string &out_dir, const std::string &filename) {
const std::string &filename) {
std::ofstream out(std::filesystem::path(out_dir) / filename); std::ofstream out(std::filesystem::path(out_dir) / filename);
if (!out.is_open()) { if (!out.is_open()) {
std::cerr << "Could not open " << filename << " !" << std::endl; std::cerr << "Could not open " << filename << " !" << std::endl;
@ -118,9 +149,8 @@ void writeSpeciesStatsToCSV(
} }
// Header // Header
out << std::left << std::setw(15) << "Iteration" << std::setw(15) out << std::left << std::setw(15) << "Iteration" << std::setw(15) << "Rollback"
<< "Rollback" << std::setw(15) << "Species" << std::setw(15) << "MAPE" << std::setw(15) << "Species" << std::setw(15) << "MAPE" << std::setw(15) << "RRMSE"
<< std::setw(15) << "RRMSE"
<< "\n" << "\n"
<< std::string(75, '-') << "\n"; << std::string(75, '-') << "\n";
@ -128,13 +158,11 @@ void writeSpeciesStatsToCSV(
for (const auto &metrics : all_stats) { for (const auto &metrics : all_stats) {
for (size_t sp_idx = 0; sp_idx < species_names.size(); ++sp_idx) { for (size_t sp_idx = 0; sp_idx < species_names.size(); ++sp_idx) {
out << std::left << std::setw(15) << metrics.iteration << std::setw(15) out << std::left << std::setw(15) << metrics.iteration << std::setw(15)
<< metrics.rb_count << std::setw(15) << species_names[sp_idx] << metrics.rb_count << std::setw(15) << species_names[sp_idx] << std::setw(15)
<< std::setw(15) << metrics.mape[sp_idx] << std::setw(15) << metrics.mape[sp_idx] << std::setw(15) << metrics.rrmse[sp_idx] << "\n";
<< metrics.rrmse[sp_idx] << "\n";
} }
out << "\n"; out << "\n";
} }
out.close(); out.close();
std::cout << "Species error metrics written to " << out_dir << "/" << filename std::cout << "Species error metrics written to " << out_dir << "/" << filename << "\n";
<< "\n";
} }

View File

@ -1,13 +1,17 @@
#pragma once
#include "Control/ControlModule.hpp"
#include <string> #include <string>
#include <vector> #include <vector>
void writeSpeciesStatsToCSV( int write_metrics(const std::vector<poet::CellMetrics> &metrics_history,
const std::vector<poet::SpeciesMetrics> &all_stats, const std::vector<std::string> &species_names,
const std::vector<std::string> &species_names, const std::string &out_dir, const std::string &dir_path, const std::string &file_name);
const std::string &filename);
void writeCellStatsToCSV(const std::vector<poet::CellMetrics> &all_stats, void writeCellStatsToCSV(const std::vector<poet::CellMetrics> &all_stats,
const std::vector<std::string> &species_names, const std::vector<std::string> &species_names,
const std::string &out_dir, const std::string &out_dir, const std::string &filename);
const std::string &filename);
// namespace poet void writeSpeciesStatsToCSV(const std::vector<poet::SpeciesMetrics> &all_stats,
const std::vector<std::string> &species_names,
const std::string &out_dir, const std::string &filename);

View File

@ -117,11 +117,13 @@ int parseInitValues(int argc, char **argv, RuntimeParameters &params) {
// cout << "CPP: DHT size per process (Byte) = " << dht_size_per_process << // cout << "CPP: DHT size per process (Byte) = " << dht_size_per_process <<
// endl; // endl;
dht_group->add_option("--dht-snaps", params.dht_snaps, dht_group->add_option(
"Save snapshots of DHT to disk: \n0 = disabled (default)\n1 = After " "--dht-snaps", params.dht_snaps,
"simulation\n2 = After each iteration"); "Save snapshots of DHT to disk: \n0 = disabled (default)\n1 = After "
"simulation\n2 = After each iteration");
auto *interp_group = app.add_option_group("Interpolation", "Interpolation related options"); auto *interp_group =
app.add_option_group("Interpolation", "Interpolation related options");
interp_group->add_flag("--interp", params.use_interp, "Enable interpolation"); interp_group->add_flag("--interp", params.use_interp, "Enable interpolation");
interp_group interp_group
@ -143,7 +145,8 @@ int parseInitValues(int argc, char **argv, RuntimeParameters &params) {
app.add_flag("--ai-surrogate", params.use_ai_surrogate, app.add_flag("--ai-surrogate", params.use_ai_surrogate,
"Enable AI surrogate for chemistry module"); "Enable AI surrogate for chemistry module");
app.add_flag("--rds", params.as_rds, "Save output as .rds file instead of default .qs2"); app.add_flag("--rds", params.as_rds,
"Save output as .rds file instead of default .qs2");
app.add_flag("--qs", params.as_qs, "Save output as .qs file instead of default .qs2"); app.add_flag("--qs", params.as_qs, "Save output as .qs file instead of default .qs2");
@ -159,7 +162,8 @@ int parseInitValues(int argc, char **argv, RuntimeParameters &params) {
->required() ->required()
->check(CLI::ExistingFile); ->check(CLI::ExistingFile);
app.add_option("out_dir", params.out_dir, "Output directory of the simulation")->required(); app.add_option("out_dir", params.out_dir, "Output directory of the simulation")
->required();
try { try {
app.parse(argc, argv); app.parse(argc, argv);
@ -234,10 +238,15 @@ int parseInitValues(int argc, char **argv, RuntimeParameters &params) {
// MDL add "out_ext" for output format to R setup // MDL add "out_ext" for output format to R setup
(*global_rt_setup)["out_ext"] = params.out_ext; (*global_rt_setup)["out_ext"] = params.out_ext;
params.timesteps = Rcpp::as<std::vector<double>>(global_rt_setup->operator[]("timesteps")); params.timesteps =
params.chkpt_interval = Rcpp::as<uint32_t>(global_rt_setup->operator[]("chkpt_interval")); Rcpp::as<std::vector<double>>(global_rt_setup->operator[]("timesteps"));
params.chkpt_interval =
Rcpp::as<uint32_t>(global_rt_setup->operator[]("chkpt_interval"));
params.rb_limit = Rcpp::as<uint32_t>(global_rt_setup->operator[]("rb_limit")); params.rb_limit = Rcpp::as<uint32_t>(global_rt_setup->operator[]("rb_limit"));
params.stab_interval = Rcpp::as<uint32_t>(global_rt_setup->operator[]("stab_interval")); params.rb_interval_limit =
Rcpp::as<uint32_t>(global_rt_setup->operator[]("rb_interval_limit"));
params.stab_interval =
Rcpp::as<uint32_t>(global_rt_setup->operator[]("stab_interval"));
params.zero_abs = Rcpp::as<double>(global_rt_setup->operator[]("zero_abs")); params.zero_abs = Rcpp::as<double>(global_rt_setup->operator[]("zero_abs"));
params.mape_threshold = params.mape_threshold =
Rcpp::as<std::vector<double>>(global_rt_setup->operator[]("mape_threshold")); Rcpp::as<std::vector<double>>(global_rt_setup->operator[]("mape_threshold"));
@ -258,7 +267,8 @@ void call_master_iter_end(RInside &R, const Field &trans, const Field &chem) {
R["TMP"] = Rcpp::wrap(trans.AsVector()); R["TMP"] = Rcpp::wrap(trans.AsVector());
R["TMP_PROPS"] = Rcpp::wrap(trans.GetProps()); R["TMP_PROPS"] = Rcpp::wrap(trans.GetProps());
R.parseEval(std::string("state_T <- setNames(data.frame(matrix(TMP, nrow=" + R.parseEval(std::string("state_T <- setNames(data.frame(matrix(TMP, nrow=" +
std::to_string(trans.GetRequestedVecSize()) + ")), TMP_PROPS)")); std::to_string(trans.GetRequestedVecSize()) +
")), TMP_PROPS)"));
R["TMP"] = Rcpp::wrap(chem.AsVector()); R["TMP"] = Rcpp::wrap(chem.AsVector());
R["TMP_PROPS"] = Rcpp::wrap(chem.GetProps()); R["TMP_PROPS"] = Rcpp::wrap(chem.GetProps());
@ -286,7 +296,7 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
double dSimTime{0}; double dSimTime{0};
for (uint32_t iter = 1; iter < maxiter + 1; iter++) { for (uint32_t iter = 1; iter < maxiter + 1; iter++) {
control.beginIteration(chem, iter, params.use_dht, params.use_interp); control.beginIteration(iter, params.use_dht, params.use_interp);
double start_t = MPI_Wtime(); double start_t = MPI_Wtime();
@ -295,7 +305,8 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
std::cout << std::endl; std::cout << std::endl;
/* displaying iteration number, with C++ and R iterator */ /* displaying iteration number, with C++ and R iterator */
MSG("Going through iteration " + std::to_string(iter) + "/" + std::to_string(maxiter)); MSG("Going through iteration " + std::to_string(iter) + "/" +
std::to_string(maxiter));
MSG("Current time step is " + std::to_string(dt)); MSG("Current time step is " + std::to_string(dt));
@ -334,13 +345,14 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
chem.set_ai_surrogate_validity_vector(R.parseEval("validity_vector")); chem.set_ai_surrogate_validity_vector(R.parseEval("validity_vector"));
MSG("AI TempField"); MSG("AI TempField");
std::vector<std::vector<double>> RTempField = R.parseEval("set_valid_predictions(predictors,\ std::vector<std::vector<double>> RTempField =
R.parseEval("set_valid_predictions(predictors,\
aipreds,\ aipreds,\
validity_vector)"); validity_vector)");
MSG("AI Set Field"); MSG("AI Set Field");
Field predictions_field = Field predictions_field = Field(R.parseEval("nrow(predictors)"), RTempField,
Field(R.parseEval("nrow(predictors)"), RTempField, R.parseEval("colnames(predictors)")); R.parseEval("colnames(predictors)"));
MSG("AI Update"); MSG("AI Update");
chem.getField().update(predictions_field); chem.getField().update(predictions_field);
@ -386,9 +398,10 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
diffusion.getField().update(chem.getField()); diffusion.getField().update(chem.getField());
MSG("End of *coupling* iteration " + std::to_string(iter) + "/" + std::to_string(maxiter)); MSG("End of *coupling* iteration " + std::to_string(iter) + "/" +
std::to_string(maxiter));
control.processCheckpoint(diffusion, iter, params.out_dir, chem.getField().GetProps()); control.processCheckpoint(iter, params.out_dir, chem.getField().GetProps());
// MSG(); // MSG();
} // END SIMULATION LOOP } // END SIMULATION LOOP
@ -426,7 +439,8 @@ static Rcpp::List RunMasterLoop(RInsidePOET &R, RuntimeParameters &params,
chem_profiling["interp_w"] = Rcpp::wrap(chem.GetWorkerInterpolationWriteTimings()); chem_profiling["interp_w"] = Rcpp::wrap(chem.GetWorkerInterpolationWriteTimings());
chem_profiling["interp_r"] = Rcpp::wrap(chem.GetWorkerInterpolationReadTimings()); chem_profiling["interp_r"] = Rcpp::wrap(chem.GetWorkerInterpolationReadTimings());
chem_profiling["interp_g"] = Rcpp::wrap(chem.GetWorkerInterpolationGatherTimings()); chem_profiling["interp_g"] = Rcpp::wrap(chem.GetWorkerInterpolationGatherTimings());
chem_profiling["interp_fc"] = Rcpp::wrap(chem.GetWorkerInterpolationFunctionCallTimings()); chem_profiling["interp_fc"] =
Rcpp::wrap(chem.GetWorkerInterpolationFunctionCallTimings());
chem_profiling["interp_calls"] = Rcpp::wrap(chem.GetWorkerInterpolationCalls()); chem_profiling["interp_calls"] = Rcpp::wrap(chem.GetWorkerInterpolationCalls());
chem_profiling["interp_cached"] = Rcpp::wrap(chem.GetWorkerPHTCacheHits()); chem_profiling["interp_cached"] = Rcpp::wrap(chem.GetWorkerPHTCacheHits());
} }
@ -481,8 +495,8 @@ std::vector<std::string> getSpeciesNames(const Field &&field, int root, MPI_Comm
for (std::uint32_t i = 0; i < n_elements; i++) { for (std::uint32_t i = 0; i < n_elements; i++) {
n_string_size = field.GetProps()[i].size(); n_string_size = field.GetProps()[i].size();
MPI_Bcast(&n_string_size, 1, MPI_UINT32_T, root, MPI_COMM_WORLD); MPI_Bcast(&n_string_size, 1, MPI_UINT32_T, root, MPI_COMM_WORLD);
MPI_Bcast(const_cast<char *>(field.GetProps()[i].c_str()), n_string_size, MPI_CHAR, root, MPI_Bcast(const_cast<char *>(field.GetProps()[i].c_str()), n_string_size, MPI_CHAR,
MPI_COMM_WORLD); root, MPI_COMM_WORLD);
} }
return field.GetProps(); return field.GetProps();
@ -626,8 +640,8 @@ int main(int argc, char *argv[]) {
// // if (MY_RANK == 0) { // get timestep vector from // // if (MY_RANK == 0) { // get timestep vector from
// // grid_init function ... // // // grid_init function ... //
*global_rt_setup = *global_rt_setup = master_init_R(*global_rt_setup, run_params.out_dir,
master_init_R(*global_rt_setup, run_params.out_dir, init_list.getInitialGrid().asSEXP()); init_list.getInitialGrid().asSEXP());
// MDL: store all parameters // MDL: store all parameters
// MSG("Calling R Function to store calling parameters"); // MSG("Calling R Function to store calling parameters");
@ -658,12 +672,12 @@ int main(int argc, char *argv[]) {
DiffusionModule diffusion(init_list.getDiffusionInit(), init_list.getInitialGrid()); DiffusionModule diffusion(init_list.getDiffusionInit(), init_list.getInitialGrid());
ControlConfig config(run_params.stab_interval, run_params.chkpt_interval, run_params.rb_limit,
run_params.zero_abs, run_params.mape_threshold);
ControlModule control(config);
chemistry.masterSetField(init_list.getInitialGrid()); chemistry.masterSetField(init_list.getInitialGrid());
ControlConfig config(run_params.stab_interval, run_params.chkpt_interval,
run_params.rb_limit, run_params.rb_interval_limit,
run_params.zero_abs, run_params.mape_threshold);
ControlModule control(config, chemistry);
chemistry.SetControlModule(&control); chemistry.SetControlModule(&control);
Rcpp::List profiling = RunMasterLoop(R, run_params, diffusion, chemistry, control); Rcpp::List profiling = RunMasterLoop(R, run_params, diffusion, chemistry, control);
@ -679,8 +693,8 @@ int main(int argc, char *argv[]) {
"'/timings.', setup$out_ext));"; "'/timings.', setup$out_ext));";
R.parseEval(r_vis_code); R.parseEval(r_vis_code);
MSG("Done! Results are stored as R objects into <" + run_params.out_dir + "/timings." + MSG("Done! Results are stored as R objects into <" + run_params.out_dir +
run_params.out_ext); "/timings." + run_params.out_ext);
} }
} }

View File

@ -53,6 +53,7 @@ struct RuntimeParameters {
std::uint32_t stab_interval = 0; std::uint32_t stab_interval = 0;
std::uint32_t chkpt_interval = 0; std::uint32_t chkpt_interval = 0;
std::uint32_t rb_limit = 0; std::uint32_t rb_limit = 0;
std::uint32_t rb_interval_limit = 0;
double zero_abs = 0.0; double zero_abs = 0.0;
std::vector<double> mape_threshold; std::vector<double> mape_threshold;
std::vector<uint32_t> ctrl_cell_ids; std::vector<uint32_t> ctrl_cell_ids;