63#include "fenix_init.h"
65#if defined(MPIX_ERR_PROC_FAILED) && !defined(MPI_ERR_PROC_FAILED)
66#define MPI_ERR_PROC_FAILED MPIX_ERR_PROC_FAILED
69#if defined(MPIX_ERR_PROC_FAILED_PENDING) && \
70 !defined(MPI_ERR_PROC_FAILED_PENDING)
71#define MPI_ERR_PROC_FAILED_PENDING MPIX_ERR_PROC_FAILED_PENDING
74#if defined(MPIX_ERR_REVOKED) && !defined(MPI_ERR_REVOKED)
75#define MPI_ERR_REVOKED MPIX_ERR_REVOKED
78#if defined(c_plusplus) || defined(__cplusplus)
97 FENIX_ERROR_UNINITIALIZED = -100,
98 FENIX_ERROR_NOCATEGORY,
99 FENIX_ERROR_CALLBACK_NOT_REGISTERED,
100 FENIX_ERROR_GROUP_CREATE,
101 FENIX_ERROR_MEMBER_CREATE,
102 FENIX_ERROR_MEMBER_EXISTS,
103 FENIX_ERROR_COMMIT_BARRIER,
104 FENIX_ERROR_INVALID_GROUPID,
105 FENIX_ERROR_INVALID_MEMBERID,
106 FENIX_ERROR_INVALID_LOGIC_CALL,
107 FENIX_ERROR_INVALID_POLICY_NAME,
108 FENIX_ERROR_INVALID_TIMESTAMP,
109 FENIX_ERROR_INVALID_TIMESTART,
110 FENIX_ERROR_INVALID_DEPTH,
111 FENIX_ERROR_INVALID_ATTRIBUTE_NAME,
112 FENIX_ERROR_INVALID_ATTRIBUTE_VALUE,
113 FENIX_ERROR_INVALID_POSITION,
114 FENIX_ERROR_DATA_WAIT,
115 FENIX_ERROR_SUBSET_NUM_BLOCKS,
116 FENIX_ERROR_SUBSET_START_OFFSET,
117 FENIX_ERROR_SUBSET_END_OFFSET,
118 FENIX_ERROR_SUBSET_STRIDE,
119 FENIX_ERROR_NODATA_FOUND,
121 FENIX_ERROR_CANCELLED,
122 FENIX_ERROR_INVALID_SETTING_NAME,
123 FENIX_ERROR_INVALID_SETTING_OPTION,
124 FENIX_ERROR_INVALID_MLOGID,
125 FENIX_ERROR_MLOG_EXISTS,
126 FENIX_ERROR_MLOG_LIBRARY_UNAVAILABLE,
127 FENIX_ERROR_PROCESS_FAILURE,
129 FENIX_WARNING_SPARE_RANKS_DEPLETED = 100,
130 FENIX_WARNING_PARTIAL_RESTORE,
135#define FENIX_ERRHANDLER_LOC 1
137#define FENIX_FINALIZE_LOC 2
139#define FENIX_DATA_COMMIT_BARRIER_LOC 4
366#define Fenix_Init(_role, _comm, _newcomm, _argc, _argv, _spare_ranks, _err) \
368 static jmp_buf bufjmp; \
369 *(_role) = __fenix_preinit( \
370 _role, _comm, _newcomm, _argc, _argv, _spare_ranks, _err, &bufjmp \
373 __fenix_postinit(); \
442 void *callback_data);
549#define FENIX_DATA_GROUP_WORLD_ID 10
550#define FENIX_GROUP_ID_MAX 11
551#define FENIX_DATA_MEMBER_ALL -1
552#define FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER 11
553#define FENIX_DATA_MEMBER_ATTRIBUTE_COUNT 12
554#define FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE 13
555#define FENIX_DATA_MEMBER_ATTRIBUTE_SIZE 14
556#define FENIX_DATA_SNAPSHOT_LATEST -1
557#define FENIX_DATA_SNAPSHOT_ALL -2
558#define FENIX_RESIZEABLE 0
559#define FENIX_DATA_SUBSET_CREATED 2
560#define FENIX_STOREV_ALL -1
562#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13
563#define FENIX_DATA_POLICY_IMR FENIX_DATA_POLICY_IN_MEMORY_RAID
565#define FENIX_TIME_STAMP_IGNORE NULL
571 MPI_Request mpi_send_req;
572 MPI_Request mpi_recv_req;
634 int depth,
int policy_name,
void* policy_value,
669 int count, MPI_Datatype datatype);
692 void *policy_value,
int *flag);
813 int num_storev,
int* storev_ids,
int* time_stamp);
864 int max_count,
int time_stamp,
910 int *array_end_offsets,
953 int *number_of_snapshots);
971 void *attributevalue,
int *flag,
int source_rank);
991 void *attribute_value,
int *flag);
1030#define FENIX_MLOG_NONE -1
1031#define FENIX_MLOG_CONTINUE -1
1156#if defined(c_plusplus) || defined(__cplusplus)
int Fenix_Data_group_created(int group_id)
Query if a data group exists on this rank.
const Fenix_Data_subset FENIX_DATA_SUBSET_FULL
A stand-in for checkpointing/recovering the full member's data.
Definition fenix.cpp:71
int Fenix_Data_checkpoint(int group_id, const Fenix_Data_subset subset, int num_storev, int *storev_ids, int *time_stamp)
Store all members of a group and then commit that group.
Definition fenix.cpp:345
int Fenix_Data_subset_delete(Fenix_Data_subset *subset_specifier)
Delete a data subset.
Definition fenix.cpp:426
int Fenix_Data_group_get_redundancy_policy(int group_id, int *policy_name, void *policy_value, int *flag)
Get the storage policy of a data group.
int Fenix_Data_member_istore(int group_id, int member_id, const Fenix_Data_subset subset_specifier, Fenix_Request *request)
UNIMPLEMENTED: As Fenix_Data_member_store,...
Definition fenix.cpp:313
int Fenix_Data_group_delete(int group_id)
Delete a data group.
Definition fenix.cpp:440
const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY
A stand-in for checkpointing/recovering no data.
Definition fenix.cpp:72
int Fenix_Data_group_create(int group_id, MPI_Comm comm, int start_time_stamp, int depth, int policy_name, void *policy_value, int *flag)
Create a Data Group.
Definition fenix.cpp:278
int Fenix_Data_member_attr_get(int group_id, int member_id, int attributename, void *attributevalue, int *flag, int source_rank)
UNIMPLEMENTED: Get the value of a member's attribute.
Definition fenix_data_recovery.cpp:398
int Fenix_Data_member_create(int group_id, int member_id, void *buffer, int count, MPI_Datatype datatype)
Create a data member for store/restore operations.
Definition fenix.cpp:289
int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset *found_data)
Restore the data of a group member from a snapshot.
Definition fenix.cpp:368
int Fenix_Data_member_istorev(int group_id, int member_id, const Fenix_Data_subset subset_specifier, Fenix_Request *request)
UNIMPLEMENTED: As Fenix_Data_member_istore,...
Definition fenix.cpp:322
int Fenix_Data_snapshot_delete(int group_id, int time_stamp)
Delete a snapshot from a data group.
Definition fenix.cpp:434
int Fenix_Data_member_storev(int group_id, int member_id, const Fenix_Data_subset subset_specifier)
UNIMPLEMENTED: As Fenix_Data_member_store,...
Definition fenix.cpp:305
int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset *found_data)
Local-only version of Fenix_Data_member_restore.
Definition fenix.cpp:386
int Fenix_Data_group_get_number_of_snapshots(int group_id, int *number_of_snapshots)
Get the number of locally-available snapshots in a data group.
Definition fenix_data_recovery.cpp:384
const Fenix_Data_subset FENIX_DATA_SUBSET_PRESTAGED
A stand-in for checkpointing/recovering all of pre-staged data.
Definition fenix.cpp:73
int Fenix_Data_member_created(int group_id, int member_id)
Query if a data member exists on this rank.
int Fenix_Data_group_get_number_of_members(int group_id, int *number_of_members)
Get the number of members in a data group.
Definition fenix_data_recovery.cpp:433
int Fenix_Data_group_get_member_at_position(int group_id, int *member_id, int position)
Get member ID based on member index.
Definition fenix_data_recovery.cpp:440
int Fenix_Data_wait(Fenix_Request request)
UNIMPLEMENTED: Block on completion of the store operation specified by...
Definition fenix_data_recovery.cpp:455
int Fenix_Data_test(Fenix_Request request, int *flag)
UNIMPLEMENTED: Query completion of the store operation specified by the...
Definition fenix_data_recovery.cpp:463
int Fenix_Data_member_restore_from_rank(int group_id, int member_id, void *data, int max_count, int time_stamp, Fenix_Data_subset *found_data, int source_rank)
UNIMPLEMENTED: As Fenix_Data_member_restore, but restores from a spec...
int Fenix_Data_subset_createv(int num_blocks, int *array_start_offsets, int *array_end_offsets, Fenix_Data_subset *subset_specifier)
As Fenix_Data_subset_create, but with varying start and end offsets.
Definition fenix.cpp:415
int Fenix_Data_member_store(int group_id, int member_id, const Fenix_Data_subset subset_specifier)
Store a particular group member into the group's resilient storage space, in uncommitted storage.
Definition fenix.cpp:297
int Fenix_Data_member_delete(int group_id, int member_id)
Delete a data member.
Definition fenix.cpp:446
int Fenix_Data_commit_barrier(int group_id, int *time_stamp)
As commit, but ensures a globally consistent commit.
Definition fenix.cpp:339
int Fenix_Data_member_attr_set(int group_id, int member_id, int attribute_name, void *attribute_value, int *flag)
Set the value of a member's attribute.
Definition fenix_data_recovery.cpp:345
int Fenix_Data_member_stage(int group_id, int member_id, const Fenix_Data_subset subset_specifier)
Serialize a group member's data into the member's local store.
int Fenix_Data_commit(int group_id, int *time_stamp)
Commit stored data members to the group's next snapshot.
Definition fenix.cpp:333
int Fenix_Data_group_get_snapshot_at_position(int group_id, int position, int *time_stamp)
Get the time stamp of a snapshot at a given index.
Definition fenix_data_recovery.cpp:390
int Fenix_Data_barrier(int group_id)
UNIMPLEMENTED: Block until all ranks in the group have reached this p...
Definition fenix.cpp:362
int Fenix_Data_subset_create(int num_blocks, int start_offset, int end_offset, int stride, Fenix_Data_subset *subset_specifier)
Create a data subset for use in store operations.
Definition fenix.cpp:404
int Fenix_Mlog_delete(int mlog_id)
Delete an mlog.
Definition fenix.cpp:503
int Fenix_Mlog_begin_region(int mlog_id, int region_id)
Set the region of the given message logger.
Definition fenix.cpp:471
int Fenix_Mlog_activate_region(int mlog_id, int region_id)
Activate the mlog and begin the region.
Definition fenix.cpp:477
int Fenix_Mlog_sync(int mlog_id, int region_id)
Synchronize messages across ranks each starting at their given region.
Definition fenix.cpp:483
int Fenix_Mlog_active(int *mlog_id)
Get the currently active message log.
Definition fenix.cpp:464
int Fenix_Mlog_create(int mlog_id, MPI_Comm *comm, int depth)
Create a new message logger.
Definition fenix.cpp:452
int Fenix_Mlog_lrestore(int mlog_id, int group_id, int member_id, int time_stamp)
Restore an mlog from a Fenix data member's local snapshot.
Definition fenix.cpp:495
int Fenix_Mlog_stage(int mlog_id, int group_id, int member_id)
Stage an mlog's data into a Fenix data member.
Definition fenix.cpp:489
int Fenix_Mlog_activate(int mlog_id)
Activate a given mlog, deactivating any previously active mlog.
Definition fenix.cpp:458
Fenix_Unhandled_mode
Options for dealing with 'unhandled' errors, e.g. invalid rank IDs.
Definition fenix.h:262
int Fenix_get_option(Fenix_Setting_name setting, unsigned *option)
Get the current option for a Fenix setting.
Definition fenix.cpp:91
Fenix_Resume_mode
Options for passing control back to application after recovery.
Definition fenix.h:235
int Fenix_Finalized(int *flag)
Sets flag to true if Fenix_Finalize has been called, else false.
Definition fenix.cpp:126
int Fenix_get_error()
Returns the error value from Fenix_Init or the latest recovery.
Definition fenix.cpp:163
int Fenix_Callback_pop()
Pop the most recently registered callback from the callback stack.
Definition fenix.cpp:109
int Fenix_Callback_register(void(*recover)(MPI_Comm, int, void *), void *callback_data)
Register a callback to be invoked after failure process recovery.
Definition fenix.cpp:98
Fenix_Recovery_mode
Options for recovering after a failed rank is detected.
Definition fenix.h:193
Fenix_Spare_wait_mode
Options for how spare ranks wait to be needed. Must be set before Fenix_Init to take effect.
Definition fenix.h:278
int Fenix_Process_fail_list(int **fail_list)
Get the list of ranks that failed in the most recent failure.
Definition fenix.cpp:131
int Fenix_get_nspare()
Returns the number of spare ranks currently available to Fenix.
Definition fenix.cpp:168
int Fenix_get_number_of_ranks_with_role(int, int *)
UNIMPLEMENTED: Returns the number of ranks with a given Fenix_Rank_role...
int Fenix_Initialized(int *flag)
Sets flag to true if Fenix_Init has been called, else false.
Definition fenix.cpp:121
int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status)
Check a pre-recovery request without error.
Definition fenix.cpp:138
int Fenix_set_option(Fenix_Setting_name setting, unsigned option)
Configure a global Fenix setting.
Definition fenix.cpp:76
int Fenix_Finalize()
Clean up Fenix state. Each active rank must call Fenix_Finalize before exiting.
Definition fenix_process_recovery.cpp:860
int Fenix_Process_detect_failures(int do_recovery)
Check for any failed ranks.
Definition fenix.cpp:152
Fenix_Rank_role Fenix_get_role()
Returns this rank's Fenix_Rank_role.
Definition fenix.cpp:158
int Fenix_Callback_invoke_all()
Invoke all callbacks with information from the last recovered fault.
Definition fenix.cpp:115
Fenix_Callback_exception_mode
Options for dealing with CommExceptions generated in callbacks.
Definition fenix.h:293
Fenix_Setting_name
Global Fenix settings.
Definition fenix.h:172
Fenix_Mlog_recovery_mode
Definition fenix.h:211
Fenix_Rank_role
All possible roles returned by Fenix_Init.
Definition fenix.h:160
int Fenix_get_rank_role(MPI_Comm comm, int rank, int *role)
UNIMPLEMENTED: Returns the Fenix_Rank_role for a given rank.
@ FENIX_UNHANDLED_PRINT
Print error and continue without handling.
Definition fenix.h:266
@ FENIX_UNHANDLED_SILENT
Ignore unhandled errors.
Definition fenix.h:264
@ FENIX_UNHANDLED_ABORT
Print error and abort Fenix's world (default)
Definition fenix.h:268
@ FENIX_UNHANDLED_MODE_MAXCODE
Not a valid option.
Definition fenix.h:271
@ FENIX_RESUME_MODE_MAXCODE
Not a valid option.
Definition fenix.h:256
@ FENIX_RESUME_THROW
Throw a fenix::CommException.
Definition fenix.h:253
@ FENIX_RESUME_JUMP
Return to Fenix_Init via longjmp (default)
Definition fenix.h:249
@ FENIX_RESUME_RETURN
Return the error code inline.
Definition fenix.h:251
@ FENIX_RECOVERY_IGNORE
Do not repair communicator, immediately resume per FENIX_RESUME_MODE.
Definition fenix.h:195
@ FENIX_RECOVERY_REPAIR
Repair the communicator with spares or by shrinking.
Definition fenix.h:203
@ FENIX_RECOVERY_MODE_MAXCODE
Not a valid option.
Definition fenix.h:208
@ FENIX_RECOVERY_SPAWN
UNIMPLEMENTED: As REPAIR, but attempt to respawn failed processes.
Definition fenix.h:205
@ FENIX_RECOVERY_NOOP
Do not repair communicator, otherwise behave normally.
Definition fenix.h:201
@ FENIX_SPARE_WAIT_YIELD
Tell MPI to yield this thread while waiting (if supported, else busy wait)
Definition fenix.h:282
@ FENIX_SPARE_WAIT_MODE_MAXCODE
Not a valid option.
Definition fenix.h:287
@ FENIX_SPARE_WAIT_SLEEP
Sleep 100ms between checks to see if this thread is needed for recovery.
Definition fenix.h:284
@ FENIX_SPARE_WAIT_BUSY
Busy wait, consuming CPU time in exchange for faster response.
Definition fenix.h:280
@ FENIX_CALLBACK_EXCEPTION_MODE_MAXCODE
Not a valid option.
Definition fenix.h:300
@ FENIX_CALLBACK_EXCEPTION_SQUASH
CommExceptions from callbacks are squashed.
Definition fenix.h:297
@ FENIX_CALLBACK_EXCEPTION_RETHROW
CommExceptions are allowed to propagate out of callbacks.
Definition fenix.h:295
@ FENIX_SETTING_NAME_MAXCODE
Not a valid option.
Definition fenix.h:187
@ FENIX_RESUME_MODE
See Fenix_Resume_mode.
Definition fenix.h:176
@ FENIX_MLOG_RECOVERY_MODE
See Fenix_Mlog_recovery_mode.
Definition fenix.h:182
@ FENIX_SPARE_WAIT_MODE
See Fenix_Spare_wait_mode.
Definition fenix.h:184
@ FENIX_CALLBACK_EXCEPTION_MODE
See Fenix_Callback_exception_mode.
Definition fenix.h:180
@ FENIX_RECOVERY_MODE
See Fenix_Recovery_mode.
Definition fenix.h:174
@ FENIX_UNHANDLED_MODE
See Fenix_Unhandled_mode.
Definition fenix.h:178
@ FENIX_MLOG_RECOVERY_MANUAL
All message logging recovery is manual.
Definition fenix.h:213
@ FENIX_MLOG_RECOVERY_INLINE_AUTOSYNC
As INLINE, but automatically sync logs with FENIX_MLOG_CONTINUE.
Definition fenix.h:227
@ FENIX_MLOG_RECOVERY_INLINE
Automatically repeats failed, logged MPI operations without disrupting normal application control flow...
Definition fenix.h:220
@ FENIX_MLOG_RECOVERY_MODE_MAXCODE
Not a valid option.
Definition fenix.h:230
@ FENIX_ROLE_RECOVERED_RANK
This rank was a spare before the most recent failure, or was just spawned.
Definition fenix.h:164
@ FENIX_ROLE_INITIAL_RANK
No failures have occurred yet.
Definition fenix.h:162
@ FENIX_ROLE_SURVIVOR_RANK
This rank was not a spare before the most recent failure.
Definition fenix.h:166
Represents a data subset that can be stored/recovered.
Definition fenix.h:585
UNIMPLEMENTED: As MPI_Request, but for Fenix asynchronous data recovery...
Definition fenix.h:570