65#include "fenix_exception.hpp"
66#include "fenix_data_subset.hpp"
106constexpr MlogRecoveryMode INLINE_AUTOSYNC =
114constexpr int STOREV_ALL = FENIX_STOREV_ALL;
/// Location tag passed as `loc` to the callback_* functions.
/// PRE_RECOVERY is 0 and POST_RECOVERY is 1; the callback_* declarations
/// in this listing default `loc` to POST_RECOVERY.
/// (Source listing line 116.)
enum CallbackLocation { PRE_RECOVERY, POST_RECOVERY };
121 MPI_Comm in_comm = MPI_COMM_WORLD;
122 MPI_Comm* out_comm =
nullptr;
124 char*** argv =
nullptr;
133void set_option(
SettingName setting,
unsigned option);
/// Declaration only; takes no arguments and returns nothing.
/// NOTE(review): presumably raises the exception type declared in
/// fenix_exception.hpp — confirm against the implementation.
/// (Source listing line 139.)
void throw_exception();
150using FenixCallbackFunc = std::function<void(MPI_Comm,
int)>;
153int callback_register(
154 FenixCallbackFunc callback, CallbackLocation loc = POST_RECOVERY
158int callback_pop(CallbackLocation loc = POST_RECOVERY);
161int callback_invoke_all(CallbackLocation loc = POST_RECOVERY);
/// Declaration only; returns a vector of ints by value.
/// NOTE(review): presumably the ranks that have failed — confirm.
/// (Source listing line 167.)
std::vector<int> fail_list();

/// Declaration only.
/// @param recover  defaults to true.
/// @return an int; NOTE(review): meaning not shown in this listing — confirm.
/// (Source listing line 170.)
int detect_failures(bool recover = true);
180namespace fenix::data {
182extern const DataSubset& SUBSET_FULL;
183extern const DataSubset& SUBSET_EMPTY;
184extern const DataSubset& SUBSET_PRESTAGED;
185extern DataSubset SUBSET_IGNORE;
189 int group_id, MPI_Comm comm,
int start_time_stamp,
int depth,
int policy_name,
190 void* policy_value,
int* flag
195 MPI_Comm comm = MPI_COMM_NULL;
196 int start_time_stamp = 0;
198 int policy_name = FENIX_DATA_POLICY_IMR;
199 void* policy_value =
nullptr;
/// Declaration only.
/// @param group_id  identifier of the data group to query.
/// @return a bool; NOTE(review): presumably true when the group exists — confirm.
/// (Source listing line 205.)
bool group_created(int group_id);
209 int group_id,
int member_id,
void* buffer,
int count, MPI_Datatype datatype
/// Declaration only.
/// @param group_id   identifier of the data group.
/// @param member_id  identifier of the member within that group.
/// @return a bool; NOTE(review): presumably true when the member exists — confirm.
/// (Source listing line 213.)
bool member_created(int group_id, int member_id);
217 int group_id,
int member_id,
const DataSubset& subset = SUBSET_FULL
222 int group_id,
int member_id,
const DataSubset& subset = SUBSET_FULL
225inline int member_store(
int group_id,
const DataSubset& subset = SUBSET_FULL) {
226 return member_store(group_id, FENIX_DATA_MEMBER_ALL, subset);
230int member_storev(
int group_id,
int member_id,
const DataSubset& subset);
232inline int member_storev(
int group_id,
const DataSubset& subset) {
233 return member_storev(group_id, FENIX_DATA_MEMBER_ALL, subset);
238 int group_id,
int member_id,
const DataSubset& subset,
Fenix_Request* request
242 int member_id = FENIX_DATA_MEMBER_ALL;
246inline int member_istore(
249 return member_istore(group_id, args.member_id, args.subset, request);
254 int group_id,
int member_id,
const DataSubset& subset,
Fenix_Request* request
257inline int member_istorev(
258 int group_id,
const DataSubset& subset,
Fenix_Request* request
260 return member_istorev(group_id, FENIX_DATA_MEMBER_ALL, subset, request);
265 int group_id,
int member_id,
void* target_buffer,
int max_length,
266 int time_stamp = FENIX_DATA_SNAPSHOT_LATEST,
267 DataSubset& data_found = SUBSET_IGNORE
272 int group_id,
int member_id,
void* target_buffer,
int max_length,
273 int time_stamp, DataSubset& data_found
/// Declaration only.
/// @param group_id    identifier of the data group.
/// @param time_stamp  optional pointer, defaults to nullptr.
///                    NOTE(review): presumably an out-parameter receiving the
///                    committed time stamp — confirm.
/// @return an int; NOTE(review): presumably a Fenix status code — confirm.
/// (Source listing line 277.)
int commit(int group_id, int* time_stamp = nullptr);

/// Declaration only; same parameter list and defaults as commit().
/// @param group_id    identifier of the data group.
/// @param time_stamp  optional pointer, defaults to nullptr.
/// @return an int; NOTE(review): presumably a Fenix status code — confirm.
/// (Source listing line 280.)
int commit_barrier(int group_id, int* time_stamp = nullptr);
284 int group_id,
const DataSubset& subset,
285 const std::vector<int>& storev_ids = {},
int* time_stamp = nullptr
290 int group_id,
const DataSubset& subset,
int* time_stamp =
nullptr
/// Declaration only; returns an optional vector of ints for `group_id`.
/// NOTE(review): presumably the member ids, empty optional on failure — confirm.
/// (Source listing line 297.)
std::optional<std::vector<int>> group_members(int group_id);

/// Declaration only; returns an optional vector of ints for `group_id`.
/// NOTE(review): presumably the snapshot time stamps, empty optional on
/// failure — confirm.
/// (Source listing line 303.)
std::optional<std::vector<int>> group_snapshots(int group_id);

/// Declaration only. Takes a group id and a timestamp; returns an int.
/// (Source listing line 306.)
int snapshot_delete(int group_id, int timestamp);

/// Declaration only. Takes a group id; returns an int.
/// (Source listing line 309.)
int group_delete(int group_id);

/// Declaration only. Takes a group id and a member id; returns an int.
/// (Source listing line 312.)
int member_delete(int group_id, int member_id);
316namespace fenix::mlog {
319int create(
int mlog_id, MPI_Comm& comm,
int depth);
/// Declaration only; single-argument overload taking only the log id.
/// (Source listing line 322.)
int activate(int mlog_id);

/// Declaration only. Takes a log id and a region id; returns an int.
/// (Source listing line 328.)
int begin_region(int mlog_id, int region_id);

/// Declaration only; two-argument overload taking a log id and a region id.
/// (Source listing line 331.)
int activate(int mlog_id, int region_id);
334int sync(
int mlog_id,
int region_id = FENIX_MLOG_CONTINUE);
/// Declaration only.
/// @param mlog_id    identifier of the message log.
/// @param group_id   identifier of the data group.
/// @param member_id  identifier of the member within that group.
/// @return an int; NOTE(review): presumably a Fenix status code — confirm.
/// (Source listing line 337.)
int stage(int mlog_id, int group_id, int member_id);
341 int mlog_id,
int group_id,
int member_id,
342 int time_stamp = FENIX_DATA_SNAPSHOT_LATEST
/// Declaration only. Takes a log id; returns an int.
/// NOTE(review): presumably a Fenix status code — confirm.
/// (Source listing line 346.)
int mlog_delete(int mlog_id);
Contains all API function calls and Fenix types. This is the only header file a user should include.
Fenix_Unhandled_mode
Options for dealing with 'unhandled' errors, e.g. invalid rank IDs.
Definition fenix.h:262
Fenix_Resume_mode
Options for passing control back to application after recovery.
Definition fenix.h:235
Fenix_Recovery_mode
Options for recovering after a failed rank is detected.
Definition fenix.h:193
Fenix_Spare_wait_mode
Options for how spare ranks wait to be needed. Must be set before Fenix_Init to take effect.
Definition fenix.h:278
Fenix_Callback_exception_mode
Options for dealing with CommExceptions generated in callbacks.
Definition fenix.h:293
Fenix_Setting_name
Global Fenix settings.
Definition fenix.h:172
Fenix_Mlog_recovery_mode
Definition fenix.h:211
Fenix_Rank_role
All possible roles returned by Fenix_Init.
Definition fenix.h:160
@ FENIX_UNHANDLED_PRINT
Print error and continue without handling.
Definition fenix.h:266
@ FENIX_UNHANDLED_SILENT
Ignore unhandled errors.
Definition fenix.h:264
@ FENIX_UNHANDLED_ABORT
Print error and abort Fenix's world (default)
Definition fenix.h:268
@ FENIX_RESUME_THROW
Throw a fenix::CommException.
Definition fenix.h:253
@ FENIX_RESUME_JUMP
Return to Fenix_Init via longjmp (default)
Definition fenix.h:249
@ FENIX_RESUME_RETURN
Return the error code inline.
Definition fenix.h:251
@ FENIX_RECOVERY_IGNORE
Do not repair communicator, immediately resume per FENIX_RESUME_MODE.
Definition fenix.h:195
@ FENIX_RECOVERY_REPAIR
Repair the communicator with spares or by shrinking.
Definition fenix.h:203
@ FENIX_RECOVERY_SPAWN
[UNIMPLEMENTED] As REPAIR, but attempt to respawn failed processes.
Definition fenix.h:205
@ FENIX_RECOVERY_NOOP
Do not repair communicator, otherwise behave normally.
Definition fenix.h:201
@ FENIX_SPARE_WAIT_YIELD
Tell MPI to yield this thread while waiting (if supported, else busy wait)
Definition fenix.h:282
@ FENIX_SPARE_WAIT_SLEEP
Sleep 100ms between checks to see if this thread is needed for recovery.
Definition fenix.h:284
@ FENIX_SPARE_WAIT_BUSY
Busy wait, consuming CPU time in exchange for faster response.
Definition fenix.h:280
@ FENIX_CALLBACK_EXCEPTION_SQUASH
CommExceptions from callbacks are squashed.
Definition fenix.h:297
@ FENIX_CALLBACK_EXCEPTION_RETHROW
CommExceptions are allowed to propagate out of callbacks.
Definition fenix.h:295
@ FENIX_RESUME_MODE
See Fenix_Resume_mode.
Definition fenix.h:176
@ FENIX_MLOG_RECOVERY_MODE
See Fenix_Mlog_recovery_mode.
Definition fenix.h:182
@ FENIX_SPARE_WAIT_MODE
See Fenix_Spare_wait_mode.
Definition fenix.h:184
@ FENIX_CALLBACK_EXCEPTION_MODE
See Fenix_Callback_exception_mode.
Definition fenix.h:180
@ FENIX_RECOVERY_MODE
See Fenix_Recovery_mode.
Definition fenix.h:174
@ FENIX_UNHANDLED_MODE
See Fenix_Unhandled_mode.
Definition fenix.h:178
@ FENIX_MLOG_RECOVERY_MANUAL
All message logging recovery is manual.
Definition fenix.h:213
@ FENIX_MLOG_RECOVERY_INLINE_AUTOSYNC
As INLINE, but automatically sync logs with FENIX_MLOG_CONTINUE.
Definition fenix.h:227
@ FENIX_MLOG_RECOVERY_INLINE
Automatically repeats failed, logged MPI operations without disrupting normal application control flo...
Definition fenix.h:220
@ FENIX_ROLE_RECOVERED_RANK
This rank was a spare before the most recent failure, or was just spawned.
Definition fenix.h:164
@ FENIX_ROLE_INITIAL_RANK
No failures have occurred yet.
Definition fenix.h:162
@ FENIX_ROLE_SURVIVOR_RANK
This rank was not a spare before the most recent failure.
Definition fenix.h:166
[UNIMPLEMENTED] As MPI_Request, but for Fenix asynchronous data recove...
Definition fenix.h:570
Definition fenix_data_subset.hpp:137