Fenix @develop
 
Loading...
Searching...
No Matches
fenix.h
Go to the documentation of this file.
1/*
2//@HEADER
3// ************************************************************************
4//
5//
6// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _|
7// _| _| _|_| _| _| _| _|
8// _|_|_| _|_|_| _| _| _| _| _|
9// _| _| _| _|_| _| _| _|
10// _| _|_|_|_| _| _| _|_|_| _| _|
11//
12//
13//
14//
15// Copyright (C) 2016 Rutgers University and Sandia Corporation
16//
17// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
18// the U.S. Government retains certain rights in this software.
19//
20// Redistribution and use in source and binary forms, with or without
21// modification, are permitted provided that the following conditions are
22// met:
23//
24// 1. Redistributions of source code must retain the above copyright
25// notice, this list of conditions and the following disclaimer.
26//
27// 2. Redistributions in binary form must reproduce the above copyright
28// notice, this list of conditions and the following disclaimer in the
29// documentation and/or other materials provided with the distribution.
30//
31// 3. Neither the name of the Corporation nor the names of the
32// contributors may be used to endorse or promote products derived from
33// this software without specific prior written permission.
34//
35// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
36// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
39// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
40// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46//
47// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
48// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
49//
50// Questions? Contact Keita Teranishi (knteran@sandia.gov) and
51// Marc Gamell (mgamell@cac.rutgers.edu)
52//
53// ************************************************************************
54//@HEADER
55*/
56
57#ifndef __FENIX__
58#define __FENIX__
59
60#include <mpi.h>
61#include <setjmp.h>
62
63#include "fenix_init.h"
64
65#if defined(MPIX_ERR_PROC_FAILED) && !defined(MPI_ERR_PROC_FAILED)
66#define MPI_ERR_PROC_FAILED MPIX_ERR_PROC_FAILED
67#endif
68
69#if defined(MPIX_ERR_PROC_FAILED_PENDING) && \
70 !defined(MPI_ERR_PROC_FAILED_PENDING)
71#define MPI_ERR_PROC_FAILED_PENDING MPIX_ERR_PROC_FAILED_PENDING
72#endif
73
74#if defined(MPIX_ERR_REVOKED) && !defined(MPI_ERR_REVOKED)
75#define MPI_ERR_REVOKED MPIX_ERR_REVOKED
76#endif
77
78#if defined(c_plusplus) || defined(__cplusplus)
79extern "C" {
80#endif
81
94typedef enum {
95 FENIX_SUCCESS = 0,
96 // Error values are negative
97 FENIX_ERROR_UNINITIALIZED = -100,
98 FENIX_ERROR_NOCATEGORY,
99 FENIX_ERROR_CALLBACK_NOT_REGISTERED,
100 FENIX_ERROR_GROUP_CREATE,
101 FENIX_ERROR_MEMBER_CREATE,
102 FENIX_ERROR_MEMBER_EXISTS,
103 FENIX_ERROR_COMMIT_BARRIER,
104 FENIX_ERROR_INVALID_GROUPID,
105 FENIX_ERROR_INVALID_MEMBERID,
106 FENIX_ERROR_INVALID_LOGIC_CALL,
107 FENIX_ERROR_INVALID_POLICY_NAME,
108 FENIX_ERROR_INVALID_TIMESTAMP,
109 FENIX_ERROR_INVALID_TIMESTART,
110 FENIX_ERROR_INVALID_DEPTH,
111 FENIX_ERROR_INVALID_ATTRIBUTE_NAME,
112 FENIX_ERROR_INVALID_ATTRIBUTE_VALUE,
113 FENIX_ERROR_INVALID_POSITION,
114 FENIX_ERROR_DATA_WAIT,
115 FENIX_ERROR_SUBSET_NUM_BLOCKS,
116 FENIX_ERROR_SUBSET_START_OFFSET,
117 FENIX_ERROR_SUBSET_END_OFFSET,
118 FENIX_ERROR_SUBSET_STRIDE,
119 FENIX_ERROR_NODATA_FOUND,
120 FENIX_ERROR_INTERN,
121 FENIX_ERROR_CANCELLED,
122 FENIX_ERROR_INVALID_SETTING_NAME,
123 FENIX_ERROR_INVALID_SETTING_OPTION,
124 FENIX_ERROR_INVALID_MLOGID,
125 FENIX_ERROR_MLOG_EXISTS,
126 FENIX_ERROR_MLOG_LIBRARY_UNAVAILABLE,
127 FENIX_ERROR_PROCESS_FAILURE,
128 //Warnings are positive
129 FENIX_WARNING_SPARE_RANKS_DEPLETED = 100,
130 FENIX_WARNING_PARTIAL_RESTORE,
131} Fenix_Return_codes;
135#define FENIX_ERRHANDLER_LOC 1
137#define FENIX_FINALIZE_LOC 2
139#define FENIX_DATA_COMMIT_BARRIER_LOC 4
140
168
189
210
258
273
289
302
366#define Fenix_Init(_role, _comm, _newcomm, _argc, _argv, _spare_ranks, _err) \
367 { \
368 static jmp_buf bufjmp; \
369 *(_role) = __fenix_preinit( \
370 _role, _comm, _newcomm, _argc, _argv, _spare_ranks, _err, &bufjmp \
371 ); \
372 setjmp(bufjmp); \
373 __fenix_postinit(); \
374 }
375
381int Fenix_Initialized(int *flag);
382
388int Fenix_Finalized(int *flag);
389
406int Fenix_set_option(Fenix_Setting_name setting, unsigned option);
407
417int Fenix_get_option(Fenix_Setting_name setting, unsigned* option);
418
441int Fenix_Callback_register(void (*recover)(MPI_Comm, int, void *),
442 void *callback_data);
443
449
455
481int Fenix_Process_detect_failures(int do_recovery);
482
485
487int Fenix_get_rank_role(MPI_Comm comm, int rank, int *role);
488
491
493int Fenix_get_error();
494
496int Fenix_get_nspare();
497
503int Fenix_Process_fail_list(int** fail_list);
504
512int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status);
513
514
537int Fenix_Finalize();
538
549#define FENIX_DATA_GROUP_WORLD_ID 10
550#define FENIX_GROUP_ID_MAX 11
551#define FENIX_DATA_MEMBER_ALL -1
552#define FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER 11
553#define FENIX_DATA_MEMBER_ATTRIBUTE_COUNT 12
554#define FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE 13
555#define FENIX_DATA_MEMBER_ATTRIBUTE_SIZE 14
556#define FENIX_DATA_SNAPSHOT_LATEST -1
557#define FENIX_DATA_SNAPSHOT_ALL -2
558#define FENIX_RESIZEABLE 0
559#define FENIX_DATA_SUBSET_CREATED 2
560#define FENIX_STOREV_ALL -1
561
562#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13
563#define FENIX_DATA_POLICY_IMR FENIX_DATA_POLICY_IN_MEMORY_RAID
564
565#define FENIX_TIME_STAMP_IGNORE NULL
566
570typedef struct {
571 MPI_Request mpi_send_req;
572 MPI_Request mpi_recv_req;
573} Fenix_Request;
574
575
585typedef struct {
587 void* impl;
588} Fenix_Data_subset;
589
590
593
596
599
600extern Fenix_Data_subset* FENIX_DATA_SUBSET_IGNORE;
601
633int Fenix_Data_group_create(int group_id, MPI_Comm comm, int start_time_stamp,
634 int depth, int policy_name, void* policy_value,
635 int* flag);
636
644int Fenix_Data_group_created(int group_id);
645
668int Fenix_Data_member_create(int group_id, int member_id, void *buffer,
669 int count, MPI_Datatype datatype);
670
679int Fenix_Data_member_created(int group_id, int member_id);
680
691int Fenix_Data_group_get_redundancy_policy(int group_id, int* policy_name,
692 void *policy_value, int *flag);
693
695int Fenix_Data_wait(Fenix_Request request);
696
697
699int Fenix_Data_test(Fenix_Request request, int *flag);
700
701
719int Fenix_Data_member_stage(int group_id, int member_id,
720 const Fenix_Data_subset subset_specifier);
721
735int Fenix_Data_member_store(int group_id, int member_id,
736 const Fenix_Data_subset subset_specifier);
737
738
740int Fenix_Data_member_storev(int group_id, int member_id,
741 const Fenix_Data_subset subset_specifier);
742
744int Fenix_Data_member_istore(int group_id, int member_id,
745 const Fenix_Data_subset subset_specifier,
746 Fenix_Request *request);
747
749int Fenix_Data_member_istorev(int group_id, int member_id,
750 const Fenix_Data_subset subset_specifier,
751 Fenix_Request *request);
752
771int Fenix_Data_commit(int group_id, int *time_stamp);
772
786int Fenix_Data_commit_barrier(int group_id, int *time_stamp);
787
812int Fenix_Data_checkpoint(int group_id, const Fenix_Data_subset subset,
813 int num_storev, int* storev_ids, int* time_stamp);
814
816int Fenix_Data_barrier(int group_id);
817
842int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer,
843 int max_count, int time_stamp, Fenix_Data_subset* found_data);
844
859int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer,
860 int max_count, int time_stamp, Fenix_Data_subset* found_data);
861
863int Fenix_Data_member_restore_from_rank(int group_id, int member_id, void *data,
864 int max_count, int time_stamp,
865 Fenix_Data_subset* found_data, int source_rank);
866
890int Fenix_Data_subset_create(int num_blocks, int start_offset, int end_offset,
891 int stride, Fenix_Data_subset *subset_specifier);
892
909int Fenix_Data_subset_createv(int num_blocks, int *array_start_offsets,
910 int *array_end_offsets,
911 Fenix_Data_subset *subset_specifier);
912
921int Fenix_Data_subset_delete(Fenix_Data_subset *subset_specifier);
922
929int Fenix_Data_group_get_number_of_members(int group_id, int *number_of_members);
930
940int Fenix_Data_group_get_member_at_position(int group_id, int *member_id,
941 int position);
942
952int Fenix_Data_group_get_number_of_snapshots(int group_id,
953 int *number_of_snapshots);
954
966int Fenix_Data_group_get_snapshot_at_position(int group_id, int position,
967 int *time_stamp);
968
970int Fenix_Data_member_attr_get(int group_id, int member_id, int attributename,
971 void *attributevalue, int *flag, int source_rank);
972
990int Fenix_Data_member_attr_set(int group_id, int member_id, int attribute_name,
991 void *attribute_value, int *flag);
992
1001int Fenix_Data_snapshot_delete(int group_id, int time_stamp);
1002
1010int Fenix_Data_group_delete(int group_id);
1011
1020int Fenix_Data_member_delete(int group_id, int member_id);
1030#define FENIX_MLOG_NONE -1
1031#define FENIX_MLOG_CONTINUE -1
1032
1044int Fenix_Mlog_create(int mlog_id, MPI_Comm* comm, int depth);
1045
1064int Fenix_Mlog_activate(int mlog_id);
1065
1073int Fenix_Mlog_active(int* mlog_id);
1074
1085int Fenix_Mlog_begin_region(int mlog_id, int region_id);
1086
1100int Fenix_Mlog_activate_region(int mlog_id, int region_id);
1101
1116int Fenix_Mlog_sync(int mlog_id, int region_id);
1117
1130int Fenix_Mlog_stage(int mlog_id, int group_id, int member_id);
1131
1142int Fenix_Mlog_lrestore(int mlog_id, int group_id, int member_id,
1143 int time_stamp);
1144
1152int Fenix_Mlog_delete(int mlog_id);
1153
1156#if defined(c_plusplus) || defined(__cplusplus)
1157} // extern "C"
1158
1159#include "fenix.hpp"
1160#endif
1161
1162#endif // __FENIX__
int Fenix_Data_group_created(int group_id)
Query if a data group exists on this rank.
const Fenix_Data_subset FENIX_DATA_SUBSET_FULL
A standin for checkpointing/recovering the full member's data.
Definition fenix.cpp:71
int Fenix_Data_checkpoint(int group_id, const Fenix_Data_subset subset, int num_storev, int *storev_ids, int *time_stamp)
Store all members of a group and then commit that group.
Definition fenix.cpp:345
int Fenix_Data_subset_delete(Fenix_Data_subset *subset_specifier)
Delete a data subset.
Definition fenix.cpp:426
int Fenix_Data_group_get_redundancy_policy(int group_id, int *policy_name, void *policy_value, int *flag)
Get the storage policy of a data group.
int Fenix_Data_member_istore(int group_id, int member_id, const Fenix_Data_subset subset_specifier, Fenix_Request *request)
<span class="mlabel"> UNIMPLEMENTED </span> As [store](Fenix_Data_member_store),...
Definition fenix.cpp:313
int Fenix_Data_group_delete(int group_id)
Delete a data group.
Definition fenix.cpp:440
const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY
A standin for checkpointing/recovering no data.
Definition fenix.cpp:72
int Fenix_Data_group_create(int group_id, MPI_Comm comm, int start_time_stamp, int depth, int policy_name, void *policy_value, int *flag)
Create a Data Group.
Definition fenix.cpp:278
int Fenix_Data_member_attr_get(int group_id, int member_id, int attributename, void *attributevalue, int *flag, int source_rank)
<span class="mlabel"> UNIMPLEMENTED </span> Get the value of a member's attribute.
Definition fenix_data_recovery.cpp:398
int Fenix_Data_member_create(int group_id, int member_id, void *buffer, int count, MPI_Datatype datatype)
Create a data member for store/restore operations.
Definition fenix.cpp:289
int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset *found_data)
Restore the data of a group member from a snapshot.
Definition fenix.cpp:368
int Fenix_Data_member_istorev(int group_id, int member_id, const Fenix_Data_subset subset_specifier, Fenix_Request *request)
<span class="mlabel"> UNIMPLEMENTED </span> As [istore](Fenix_Data_member_istore),...
Definition fenix.cpp:322
int Fenix_Data_snapshot_delete(int group_id, int time_stamp)
Delete a snapshot from a data group.
Definition fenix.cpp:434
int Fenix_Data_member_storev(int group_id, int member_id, const Fenix_Data_subset subset_specifier)
<span class="mlabel"> UNIMPLEMENTED </span> As [store](Fenix_Data_member_store),...
Definition fenix.cpp:305
int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset *found_data)
Local-only version of Fenix_Data_member_restore.
Definition fenix.cpp:386
int Fenix_Data_group_get_number_of_snapshots(int group_id, int *number_of_snapshots)
Get the number of locally-available snapshots in a data group.
Definition fenix_data_recovery.cpp:384
const Fenix_Data_subset FENIX_DATA_SUBSET_PRESTAGED
A standin for checkpointing/recovering all of pre-staged data.
Definition fenix.cpp:73
int Fenix_Data_member_created(int group_id, int member_id)
Query if a data member exists on this rank.
int Fenix_Data_group_get_number_of_members(int group_id, int *number_of_members)
Get the number of members in a data group.
Definition fenix_data_recovery.cpp:433
int Fenix_Data_group_get_member_at_position(int group_id, int *member_id, int position)
Get member ID based on member index.
Definition fenix_data_recovery.cpp:440
int Fenix_Data_wait(Fenix_Request request)
<span class="mlabel"> UNIMPLEMENTED </span> Block on completion of the store operation specified b...
Definition fenix_data_recovery.cpp:455
int Fenix_Data_test(Fenix_Request request, int *flag)
<span class="mlabel"> UNIMPLEMENTED </span> Query completion of the store operation specified by t...
Definition fenix_data_recovery.cpp:463
int Fenix_Data_member_restore_from_rank(int group_id, int member_id, void *data, int max_count, int time_stamp, Fenix_Data_subset *found_data, int source_rank)
<span class="mlabel"> UNIMPLEMENTED </span> As Fenix_Data_member_restore, but restores from a spec...
int Fenix_Data_subset_createv(int num_blocks, int *array_start_offsets, int *array_end_offsets, Fenix_Data_subset *subset_specifier)
As Fenix_Data_subset_create, but with varying start and end offsets.
Definition fenix.cpp:415
int Fenix_Data_member_store(int group_id, int member_id, const Fenix_Data_subset subset_specifier)
Store a particular group member into the group's resilient storage space, in uncommitted storage.
Definition fenix.cpp:297
int Fenix_Data_member_delete(int group_id, int member_id)
Delete a data member.
Definition fenix.cpp:446
int Fenix_Data_commit_barrier(int group_id, int *time_stamp)
As commit, but ensures a globally consistent commit.
Definition fenix.cpp:339
int Fenix_Data_member_attr_set(int group_id, int member_id, int attribute_name, void *attribute_value, int *flag)
Set the value of a member's attribute.
Definition fenix_data_recovery.cpp:345
int Fenix_Data_member_stage(int group_id, int member_id, const Fenix_Data_subset subset_specifier)
Serialize a group member's data into the member's local store.
int Fenix_Data_commit(int group_id, int *time_stamp)
Commit stored data members to the group's next snapshot.
Definition fenix.cpp:333
int Fenix_Data_group_get_snapshot_at_position(int group_id, int position, int *time_stamp)
Get the time stamp of a snapshot at a given index.
Definition fenix_data_recovery.cpp:390
int Fenix_Data_barrier(int group_id)
<span class="mlabel"> UNIMPLEMENTED </span> Block until all ranks in the group have reached this p...
Definition fenix.cpp:362
int Fenix_Data_subset_create(int num_blocks, int start_offset, int end_offset, int stride, Fenix_Data_subset *subset_specifier)
Create a data subset for use in store operations.
Definition fenix.cpp:404
int Fenix_Mlog_delete(int mlog_id)
Delete an mlog.
Definition fenix.cpp:503
int Fenix_Mlog_begin_region(int mlog_id, int region_id)
Set the region of the given message logger.
Definition fenix.cpp:471
int Fenix_Mlog_activate_region(int mlog_id, int region_id)
Activate the mlog and begin the region.
Definition fenix.cpp:477
int Fenix_Mlog_sync(int mlog_id, int region_id)
Synchronize messages across ranks each starting at their given region.
Definition fenix.cpp:483
int Fenix_Mlog_active(int *mlog_id)
Get the currently active message log.
Definition fenix.cpp:464
int Fenix_Mlog_create(int mlog_id, MPI_Comm *comm, int depth)
Create a new message logger.
Definition fenix.cpp:452
int Fenix_Mlog_lrestore(int mlog_id, int group_id, int member_id, int time_stamp)
Restore an mlog from a Fenix data member's local snapshot.
Definition fenix.cpp:495
int Fenix_Mlog_stage(int mlog_id, int group_id, int member_id)
Stage an mlog's data into a Fenix data member.
Definition fenix.cpp:489
int Fenix_Mlog_activate(int mlog_id)
Activate a given mlog, deactivating any previously active mlog.
Definition fenix.cpp:458
Fenix_Unhandled_mode
Options for dealing with 'unhandled' errors, e.g. invalid rank IDs.
Definition fenix.h:262
int Fenix_get_option(Fenix_Setting_name setting, unsigned *option)
Get the current option for a Fenix setting.
Definition fenix.cpp:91
Fenix_Resume_mode
Options for passing control back to application after recovery.
Definition fenix.h:235
int Fenix_Finalized(int *flag)
Sets flag to true if Fenix_Finalize has been called, else false.
Definition fenix.cpp:126
int Fenix_get_error()
Returns the error value from Fenix_Init or the latest recovery.
Definition fenix.cpp:163
int Fenix_Callback_pop()
Pop the most recently registered callback from the callback stack.
Definition fenix.cpp:109
int Fenix_Callback_register(void(*recover)(MPI_Comm, int, void *), void *callback_data)
Register a callback to be invoked after failure process recovery.
Definition fenix.cpp:98
Fenix_Recovery_mode
Options for recovering after a failed rank is detected.
Definition fenix.h:193
Fenix_Spare_wait_mode
Options for how spare ranks wait to be needed. Must be set before Fenix_Init to take effect.
Definition fenix.h:278
int Fenix_Process_fail_list(int **fail_list)
Get the list of ranks that failed in the most recent failure.
Definition fenix.cpp:131
int Fenix_get_nspare()
Returns the number of spare ranks currently available to Fenix.
Definition fenix.cpp:168
int Fenix_get_number_of_ranks_with_role(int, int *)
<span class="mlabel"> UNIMPLEMENTED </span> Returns the number of ranks with a given Fenix_Rank_ro...
int Fenix_Initialized(int *flag)
Sets flag to true if Fenix_Init has been called, else false.
Definition fenix.cpp:121
int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status)
Check a pre-recovery request without error.
Definition fenix.cpp:138
int Fenix_set_option(Fenix_Setting_name setting, unsigned option)
Configure a global Fenix setting.
Definition fenix.cpp:76
int Fenix_Finalize()
Clean up Fenix state. Each active rank must call Fenix_Finalize before exiting.
Definition fenix_process_recovery.cpp:860
int Fenix_Process_detect_failures(int do_recovery)
Check for any failed ranks.
Definition fenix.cpp:152
Fenix_Rank_role Fenix_get_role()
Returns this rank's Fenix_Rank_role.
Definition fenix.cpp:158
int Fenix_Callback_invoke_all()
Invoke all callbacks with information from the last recovered fault.
Definition fenix.cpp:115
Fenix_Callback_exception_mode
Options for dealing with CommExceptions generated in callbacks.
Definition fenix.h:293
Fenix_Setting_name
Global Fenix settings.
Definition fenix.h:172
Fenix_Mlog_recovery_mode
Definition fenix.h:211
Fenix_Rank_role
All possible roles returned by Fenix_Init.
Definition fenix.h:160
int Fenix_get_rank_role(MPI_Comm comm, int rank, int *role)
<span class="mlabel"> UNIMPLEMENTED </span> Returns the Fenix_Rank_role for a given rank
@ FENIX_UNHANDLED_PRINT
Print error and continue without handling.
Definition fenix.h:266
@ FENIX_UNHANDLED_SILENT
Ignore unhandled errors.
Definition fenix.h:264
@ FENIX_UNHANDLED_ABORT
Print error and abort Fenix's world (default)
Definition fenix.h:268
@ FENIX_UNHANDLED_MODE_MAXCODE
Not a valid option.
Definition fenix.h:271
@ FENIX_RESUME_MODE_MAXCODE
Not a valid option.
Definition fenix.h:256
@ FENIX_RESUME_THROW
Throw a fenix::CommException.
Definition fenix.h:253
@ FENIX_RESUME_JUMP
Return to Fenix_Init via longjmp (default)
Definition fenix.h:249
@ FENIX_RESUME_RETURN
Return the error code inline.
Definition fenix.h:251
@ FENIX_RECOVERY_IGNORE
Do not repair communicator, immediately resume per FENIX_RESUME_MODE.
Definition fenix.h:195
@ FENIX_RECOVERY_REPAIR
Repair the communicator with spares or by shrinking.
Definition fenix.h:203
@ FENIX_RECOVERY_MODE_MAXCODE
Not a valid option.
Definition fenix.h:208
@ FENIX_RECOVERY_SPAWN
<span class="mlabel"> UNIMPLEMENTED </span> As REPAIR, but attempt to respawn failed processes
Definition fenix.h:205
@ FENIX_RECOVERY_NOOP
Do not repair communicator, otherwise behave normally.
Definition fenix.h:201
@ FENIX_SPARE_WAIT_YIELD
Tell MPI to yield this thread while waiting (if supported, else busy wait)
Definition fenix.h:282
@ FENIX_SPARE_WAIT_MODE_MAXCODE
Not a valid option.
Definition fenix.h:287
@ FENIX_SPARE_WAIT_SLEEP
Sleep 100ms between checks to see if this thread is needed for recovery.
Definition fenix.h:284
@ FENIX_SPARE_WAIT_BUSY
Busy wait, consuming CPU time in exchange for faster response.
Definition fenix.h:280
@ FENIX_CALLBACK_EXCEPTION_MODE_MAXCODE
Not a valid option.
Definition fenix.h:300
@ FENIX_CALLBACK_EXCEPTION_SQUASH
CommExceptions from callbacks are squashed.
Definition fenix.h:297
@ FENIX_CALLBACK_EXCEPTION_RETHROW
CommExceptions are allowed to propagate out of callbacks.
Definition fenix.h:295
@ FENIX_SETTING_NAME_MAXCODE
Not a valid option.
Definition fenix.h:187
@ FENIX_RESUME_MODE
See Fenix_Resume_mode.
Definition fenix.h:176
@ FENIX_MLOG_RECOVERY_MODE
See Fenix_Mlog_recovery_mode.
Definition fenix.h:182
@ FENIX_SPARE_WAIT_MODE
See Fenix_Spare_wait_mode.
Definition fenix.h:184
@ FENIX_CALLBACK_EXCEPTION_MODE
See Fenix_Callback_exception_mode.
Definition fenix.h:180
@ FENIX_RECOVERY_MODE
See Fenix_Recovery_mode.
Definition fenix.h:174
@ FENIX_UNHANDLED_MODE
See Fenix_Unhandled_mode.
Definition fenix.h:178
@ FENIX_MLOG_RECOVERY_MANUAL
All message logging recovery is manual.
Definition fenix.h:213
@ FENIX_MLOG_RECOVERY_INLINE_AUTOSYNC
As INLINE, but automatically sync logs with FENIX_MLOG_CONTINUE.
Definition fenix.h:227
@ FENIX_MLOG_RECOVERY_INLINE
Automatically repeats failed, logged MPI operations without disrupting normal application control flo...
Definition fenix.h:220
@ FENIX_MLOG_RECOVERY_MODE_MAXCODE
Not a valid option.
Definition fenix.h:230
@ FENIX_ROLE_RECOVERED_RANK
This rank was a spare before the most recent failure, or was just spawned.
Definition fenix.h:164
@ FENIX_ROLE_INITIAL_RANK
No failures have occurred yet.
Definition fenix.h:162
@ FENIX_ROLE_SURVIVOR_RANK
This rank was not a spare before the most recent failure.
Definition fenix.h:166
Fenix_Data_subset
Represents a data subset that can be stored/recovered.
Definition fenix.h:585
Fenix_Request
<span class="mlabel"> UNIMPLEMENTED </span> As MPI_Request, but for Fenix asynchronous data recove...
Definition fenix.h:570