Fenix @develop
 
Loading...
Searching...
No Matches
fenix.hpp
1/*
2//@HEADER
3// ************************************************************************
4//
5//
6// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _|
7// _| _| _|_| _| _| _| _|
8// _|_|_| _|_|_| _| _| _| _| _|
9// _| _| _| _|_| _| _| _|
10// _| _|_|_|_| _| _| _|_|_| _| _|
11//
12//
13//
14//
15// Copyright (C) 2016 Rutgers University and Sandia Corporation
16//
17// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
18// the U.S. Government retains certain rights in this software.
19//
20// Redistribution and use in source and binary forms, with or without
21// modification, are permitted provided that the following conditions are
22// met:
23//
24// 1. Redistributions of source code must retain the above copyright
25// notice, this list of conditions and the following disclaimer.
26//
27// 2. Redistributions in binary form must reproduce the above copyright
28// notice, this list of conditions and the following disclaimer in the
29// documentation and/or other materials provided with the distribution.
30//
31// 3. Neither the name of the Corporation nor the names of the
32// contributors may be used to endorse or promote products derived from
33// this software without specific prior written permission.
34//
35// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
36// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
39// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
40// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46//
47// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
48// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
49//
50// Questions? Contact Keita Teranishi (knteran@sandia.gov) and
51// Marc Gamell (mgamell@cac.rutgers.edu)
52//
53// ************************************************************************
54//@HEADER
55*/
56
57#ifndef __FENIX_HPP__
58#define __FENIX_HPP__
59
60#include <mpi.h>
61#include <functional>
62#include <vector>
63#include <optional>
64#include "fenix.h"
65#include "fenix_exception.hpp"
66#include "fenix_data_subset.hpp"
67
68namespace fenix {
69
70using Role = Fenix_Rank_role;
71constexpr Role INITIAL_RANK = FENIX_ROLE_INITIAL_RANK;
72constexpr Role RECOVERED_RANK = FENIX_ROLE_RECOVERED_RANK;
73constexpr Role SURVIVOR_RANK = FENIX_ROLE_SURVIVOR_RANK;
74
75using SettingName = Fenix_Setting_name;
76constexpr SettingName RECOVERY_MODE = FENIX_RECOVERY_MODE;
77constexpr SettingName RESUME_MODE = FENIX_RESUME_MODE;
78constexpr SettingName UNHANDLED_MODE = FENIX_UNHANDLED_MODE;
79constexpr SettingName CALLBACK_EXCEPTION_MODE = FENIX_CALLBACK_EXCEPTION_MODE;
80constexpr SettingName MLOG_RECOVERY_MODE = FENIX_MLOG_RECOVERY_MODE;
81constexpr SettingName SPARE_WAIT_MODE = FENIX_SPARE_WAIT_MODE;
82
83using RecoveryMode = Fenix_Recovery_mode;
84constexpr RecoveryMode IGNORE = FENIX_RECOVERY_IGNORE;
85constexpr RecoveryMode NOOP = FENIX_RECOVERY_NOOP;
86constexpr RecoveryMode REPAIR = FENIX_RECOVERY_REPAIR;
87constexpr RecoveryMode SPAWN = FENIX_RECOVERY_SPAWN;
88
89using ResumeMode = Fenix_Resume_mode;
90constexpr ResumeMode JUMP = FENIX_RESUME_JUMP;
91constexpr ResumeMode RETURN = FENIX_RESUME_RETURN;
92constexpr ResumeMode THROW = FENIX_RESUME_THROW;
93
94using UnhandledMode = Fenix_Unhandled_mode;
95constexpr UnhandledMode SILENT = FENIX_UNHANDLED_SILENT;
96constexpr UnhandledMode PRINT = FENIX_UNHANDLED_PRINT;
97constexpr UnhandledMode ABORT = FENIX_UNHANDLED_ABORT;
98
99using CallbackExceptionMode = Fenix_Callback_exception_mode;
100constexpr CallbackExceptionMode RETHROW = FENIX_CALLBACK_EXCEPTION_RETHROW;
101constexpr CallbackExceptionMode SQUASH = FENIX_CALLBACK_EXCEPTION_SQUASH;
102
103using MlogRecoveryMode = Fenix_Mlog_recovery_mode;
104constexpr MlogRecoveryMode MANUAL = FENIX_MLOG_RECOVERY_MANUAL;
105constexpr MlogRecoveryMode INLINE = FENIX_MLOG_RECOVERY_INLINE;
106constexpr MlogRecoveryMode INLINE_AUTOSYNC =
108
109using SpareWaitMode = Fenix_Spare_wait_mode;
110constexpr SpareWaitMode BUSY = FENIX_SPARE_WAIT_BUSY;
111constexpr SpareWaitMode YIELD = FENIX_SPARE_WAIT_YIELD;
112constexpr SpareWaitMode SLEEP = FENIX_SPARE_WAIT_SLEEP;
113
114constexpr int STOREV_ALL = FENIX_STOREV_ALL;
115
//! @brief Selects which callback list a callback_* function operates on.
//! NOTE(review): the names suggest "before" vs. "after" recovery — confirm
//! the exact invocation points against the implementation.
enum CallbackLocation {
  PRE_RECOVERY = 0,
  POST_RECOVERY = 1
};
117
118namespace args {
120 int* role = nullptr;
121 MPI_Comm in_comm = MPI_COMM_WORLD;
122 MPI_Comm* out_comm = nullptr;
123 int* argc = nullptr;
124 char*** argv = nullptr;
125 int spares = 0;
126 int* err = nullptr;
127};
128}
129
130void init(const args::FenixInitArgs args);
131
133void set_option(SettingName setting, unsigned option);
134
136unsigned get_option(SettingName setting);
137
139void throw_exception();
140
142Fenix_Rank_role role();
143
//! NOTE(review): presumably the error code associated with the most recent
//! failure/recovery — defined elsewhere, confirm.
int error();

//! NOTE(review): presumably the number of spare ranks currently available —
//! defined elsewhere, confirm.
int nspare();
149
150using FenixCallbackFunc = std::function<void(MPI_Comm, int)>;
151
153int callback_register(
154 FenixCallbackFunc callback, CallbackLocation loc = POST_RECOVERY
155);
156
157//@!brief Overload of #Fenix_Callback_pop
158int callback_pop(CallbackLocation loc = POST_RECOVERY);
159
160//@!brief Overload of #Fenix_Callback_invoke_all
161int callback_invoke_all(CallbackLocation loc = POST_RECOVERY);
162
//! @return A vector of ints.
//! NOTE(review): presumably the ranks that failed most recently — confirm.
std::vector<int> fail_list();

//! @param recover Defaults to true.
//! NOTE(review): presumably checks for failed ranks and, when @p recover is
//! true, also starts recovery — confirm against the implementation.
//! @return An int status code (meaning defined by the implementation).
int detect_failures(bool recover = true);

//! NOTE(review): presumably true once Fenix has been initialized — confirm.
bool initialized();

//! NOTE(review): presumably true once Fenix has been finalized — confirm.
bool finalized();
177
178} // namespace fenix
179
180namespace fenix::data {
181
182extern const DataSubset& SUBSET_FULL;
183extern const DataSubset& SUBSET_EMPTY;
184extern const DataSubset& SUBSET_PRESTAGED;
185extern DataSubset SUBSET_IGNORE;
186
187//@!brief Overload of #Fenix_Data_group_create
188int group_create(
189 int group_id, MPI_Comm comm, int start_time_stamp, int depth, int policy_name,
190 void* policy_value, int* flag
191);
192
194 // MPI_COMM_NULL defaults to the resilient communicator
195 MPI_Comm comm = MPI_COMM_NULL;
196 int start_time_stamp = 0;
197 int depth = 1;
198 int policy_name = FENIX_DATA_POLICY_IMR;
199 void* policy_value = nullptr;
200 int* flag = nullptr;
201};
202int group_create(int group_id, GroupCreateArgs args = {});
203
204//@!brief Overload of #Fenix_Data_group_created
205bool group_created(int group_id);
206
207//@!brief Overload of #Fenix_Data_member_create
208int member_create(
209 int group_id, int member_id, void* buffer, int count, MPI_Datatype datatype
210);
211
212//@!brief Overload of #Fenix_Data_member_created
213bool member_created(int group_id, int member_id);
214
216int member_stage(
217 int group_id, int member_id, const DataSubset& subset = SUBSET_FULL
218);
219
221int member_store(
222 int group_id, int member_id, const DataSubset& subset = SUBSET_FULL
223);
225inline int member_store(int group_id, const DataSubset& subset = SUBSET_FULL) {
226 return member_store(group_id, FENIX_DATA_MEMBER_ALL, subset);
227}
228
230int member_storev(int group_id, int member_id, const DataSubset& subset);
232inline int member_storev(int group_id, const DataSubset& subset) {
233 return member_storev(group_id, FENIX_DATA_MEMBER_ALL, subset);
234}
235
237int member_istore(
238 int group_id, int member_id, const DataSubset& subset, Fenix_Request* request
239);
240
242 int member_id = FENIX_DATA_MEMBER_ALL;
243 const DataSubset& subset = SUBSET_FULL;
244};
246inline int member_istore(
247 int group_id, Fenix_Request* request, MemberIstoreArgs args = {}
248) {
249 return member_istore(group_id, args.member_id, args.subset, request);
250}
251
253int member_istorev(
254 int group_id, int member_id, const DataSubset& subset, Fenix_Request* request
255);
257inline int member_istorev(
258 int group_id, const DataSubset& subset, Fenix_Request* request
259) {
260 return member_istorev(group_id, FENIX_DATA_MEMBER_ALL, subset, request);
261}
262
264int member_restore(
265 int group_id, int member_id, void* target_buffer, int max_length,
266 int time_stamp = FENIX_DATA_SNAPSHOT_LATEST,
267 DataSubset& data_found = SUBSET_IGNORE
268);
269
271int member_lrestore(
272 int group_id, int member_id, void* target_buffer, int max_length,
273 int time_stamp, DataSubset& data_found
274);
275
//! @param time_stamp Optional int pointer; defaults to nullptr.
//! NOTE(review): presumably the C++ overload of Fenix_Data_commit, with
//! @p time_stamp receiving the committed snapshot's timestamp — confirm.
int commit(int group_id, int* time_stamp = nullptr);

//! @param time_stamp Optional int pointer; defaults to nullptr.
//! NOTE(review): presumably the C++ overload of Fenix_Data_commit_barrier —
//! confirm against fenix.h.
int commit_barrier(int group_id, int* time_stamp = nullptr);
281
283int checkpoint(
284 int group_id, const DataSubset& subset,
285 const std::vector<int>& storev_ids = {}, int* time_stamp = nullptr
286);
287
289int checkpointv(
290 int group_id, const DataSubset& subset, int* time_stamp = nullptr
291);
292
//! @return An optional list of ints for the given group.
//! NOTE(review): presumably the member ids of the group, with an empty
//! optional signalling an error (e.g. unknown group) — confirm.
std::optional<std::vector<int>> group_members(int group_id);

//! @return An optional list of ints for the given group.
//! NOTE(review): presumably the timestamps of the group's snapshots, with an
//! empty optional signalling an error — confirm.
std::optional<std::vector<int>> group_snapshots(int group_id);
304
//! @brief Overload of #Fenix_Data_snapshot_delete
int snapshot_delete(int group_id, int timestamp);

//! @brief Overload of #Fenix_Data_group_delete
int group_delete(int group_id);

//! @brief Overload of #Fenix_Data_member_delete
int member_delete(int group_id, int member_id);
313
314} // namespace fenix::data
315
316namespace fenix::mlog {
317
318//@brief Overload of #Fenix_Mlog_create
319int create(int mlog_id, MPI_Comm& comm, int depth);
320
//! @brief Overload of #Fenix_Mlog_activate
int activate(int mlog_id);

//! @brief Overload of #Fenix_Mlog_active
//! @return The active log.
int active();

//! @brief Overload of #Fenix_Mlog_begin_region
int begin_region(int mlog_id, int region_id);

//! @brief Overload of #Fenix_Mlog_activate_region
int activate(int mlog_id, int region_id);
332
333//@brief Overload of #Fenix_Mlog_sync
334int sync(int mlog_id, int region_id = FENIX_MLOG_CONTINUE);
335
336//@brief Overload of #Fenix_Mlog_stage
337int stage(int mlog_id, int group_id, int member_id);
338
339//@brief Overload of #Fenix_Mlog_lrestore
340int lrestore(
341 int mlog_id, int group_id, int member_id,
342 int time_stamp = FENIX_DATA_SNAPSHOT_LATEST
343);
344
//! @brief Overload of #Fenix_Mlog_delete
//! @param mlog_id Identifier of the log to delete.
int mlog_delete(int mlog_id);
347
348} // namespace fenix::mlog
349
350#endif
Contains all API function calls and Fenix types. This is the only header file a user should include.
Fenix_Unhandled_mode
Options for dealing with 'unhandled' errors, e.g. invalid rank IDs.
Definition fenix.h:262
Fenix_Resume_mode
Options for passing control back to application after recovery.
Definition fenix.h:235
Fenix_Recovery_mode
Options for recovering after a failed rank is detected.
Definition fenix.h:193
Fenix_Spare_wait_mode
Options for how spare ranks wait to be needed. Must be set before Fenix_Init to take effect.
Definition fenix.h:278
Fenix_Callback_exception_mode
Options for dealing with CommExceptions generated in callbacks.
Definition fenix.h:293
Fenix_Setting_name
Global Fenix settings.
Definition fenix.h:172
Fenix_Mlog_recovery_mode
Definition fenix.h:211
Fenix_Rank_role
All possible roles returned by Fenix_Init.
Definition fenix.h:160
@ FENIX_UNHANDLED_PRINT
Print error and continue without handling.
Definition fenix.h:266
@ FENIX_UNHANDLED_SILENT
Ignore unhandled errors.
Definition fenix.h:264
@ FENIX_UNHANDLED_ABORT
Print error and abort Fenix's world (default)
Definition fenix.h:268
@ FENIX_RESUME_THROW
Throw a fenix::CommException.
Definition fenix.h:253
@ FENIX_RESUME_JUMP
Return to Fenix_Init via longjmp (default)
Definition fenix.h:249
@ FENIX_RESUME_RETURN
Return the error code inline.
Definition fenix.h:251
@ FENIX_RECOVERY_IGNORE
Do not repair communicator, immediately resume per FENIX_RESUME_MODE.
Definition fenix.h:195
@ FENIX_RECOVERY_REPAIR
Repair the communicator with spares or by shrinking.
Definition fenix.h:203
@ FENIX_RECOVERY_SPAWN
[UNIMPLEMENTED] As REPAIR, but attempt to respawn failed processes
Definition fenix.h:205
@ FENIX_RECOVERY_NOOP
Do not repair communicator, otherwise behave normally.
Definition fenix.h:201
@ FENIX_SPARE_WAIT_YIELD
Tell MPI to yield this thread while waiting (if supported, else busy wait)
Definition fenix.h:282
@ FENIX_SPARE_WAIT_SLEEP
Sleep 100ms between checks to see if this thread is needed for recovery.
Definition fenix.h:284
@ FENIX_SPARE_WAIT_BUSY
Busy wait, consuming CPU time in exchange for faster response.
Definition fenix.h:280
@ FENIX_CALLBACK_EXCEPTION_SQUASH
CommExceptions from callbacks are squashed.
Definition fenix.h:297
@ FENIX_CALLBACK_EXCEPTION_RETHROW
CommExceptions are allowed to propagate out of callbacks.
Definition fenix.h:295
@ FENIX_RESUME_MODE
See Fenix_Resume_mode.
Definition fenix.h:176
@ FENIX_MLOG_RECOVERY_MODE
See Fenix_Mlog_recovery_mode.
Definition fenix.h:182
@ FENIX_SPARE_WAIT_MODE
See Fenix_Spare_wait_mode.
Definition fenix.h:184
@ FENIX_CALLBACK_EXCEPTION_MODE
See Fenix_Callback_exception_mode.
Definition fenix.h:180
@ FENIX_RECOVERY_MODE
See Fenix_Recovery_mode.
Definition fenix.h:174
@ FENIX_UNHANDLED_MODE
See Fenix_Unhandled_mode.
Definition fenix.h:178
@ FENIX_MLOG_RECOVERY_MANUAL
All message logging recovery is manual.
Definition fenix.h:213
@ FENIX_MLOG_RECOVERY_INLINE_AUTOSYNC
As INLINE, but automatically sync logs with FENIX_MLOG_CONTINUE.
Definition fenix.h:227
@ FENIX_MLOG_RECOVERY_INLINE
Automatically repeats failed, logged MPI operations without disrupting normal application control flow...
Definition fenix.h:220
@ FENIX_ROLE_RECOVERED_RANK
This rank was a spare before the most recent failure, or was just spawned.
Definition fenix.h:164
@ FENIX_ROLE_INITIAL_RANK
No failures have occurred yet.
Definition fenix.h:162
@ FENIX_ROLE_SURVIVOR_RANK
This rank was not a spare before the most recent failure.
Definition fenix.h:166
[UNIMPLEMENTED] As MPI_Request, but for Fenix asynchronous data recovery...
Definition fenix.h:570
Definition fenix_data_subset.hpp:137
Definition fenix.hpp:119
Definition fenix.hpp:193
Definition fenix.hpp:241