Fenix @develop
 
Loading...
Searching...
No Matches
fenix_ext.hpp
1/*
2//@HEADER
3// ************************************************************************
4//
5//
6// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _|
7// _| _| _|_| _| _| _| _|
8// _|_|_| _|_|_| _| _| _| _| _|
9// _| _| _| _|_| _| _| _|
10// _| _|_|_|_| _| _| _|_|_| _| _|
11//
12//
13//
14//
15// Copyright (C) 2016 Rutgers University and Sandia Corporation
16//
17// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
18// the U.S. Government retains certain rights in this software.
19//
20// Redistribution and use in source and binary forms, with or without
21// modification, are permitted provided that the following conditions are
22// met:
23//
24// 1. Redistributions of source code must retain the above copyright
25// notice, this list of conditions and the following disclaimer.
26//
27// 2. Redistributions in binary form must reproduce the above copyright
28// notice, this list of conditions and the following disclaimer in the
29// documentation and/or other materials provided with the distribution.
30//
31// 3. Neither the name of the Corporation nor the names of the
32// contributors may be used to endorse or promote products derived from
33// this software without specific prior written permission.
34//
35// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
36// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
39// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
40// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46//
47// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
48// Michael Heroux, and Matthew Whitlock
49//
50// Questions? Contact Keita Teranishi (knteran@sandia.gov) and
51// Marc Gamell (mgamell@cac.rutgers.edu)
52//
53// ************************************************************************
54//@HEADER
55*/
56
57#ifndef __FENIX_EXT_H__
58#define __FENIX_EXT_H__
59
60#include <mpi.h>
61#include <vector>
62#include <unordered_map>
63#include <map>
64#include "fenix.h"
65#include "fenix.hpp"
66#include "fenix_opt.hpp"
67#include "fenix_data_group.hpp"
68#include "fenix/logging/comm_log.h"
69
70namespace fenix {
71
72// Fenix's global settings, configurable before initialization
   // Each field visible in this listing carries an in-class default, so a
   // default-constructed Settings has a defined value for all of them.
   // NOTE(review): the enum types used here are declared elsewhere (presumably
   // fenix.hpp, as C++ counterparts of the Fenix_*_mode enums) — confirm there.
73struct Settings {
   // presumably selects how to recover after a failed rank is detected — confirm
74 RecoveryMode recovery = REPAIR;
75 // Defaults to JUMP if a jump_buf provided, else THROW
   // NOTE(review): listing line 76 is absent from this extract, so the
   // declaration that the comment above describes (presumably the resume-mode
   // setting) is not visible here — restore it from the real header.
   // presumably governs exceptions raised inside user callbacks — confirm
77 CallbackExceptionMode cb_exception = SQUASH;
   // presumably governs 'unhandled' errors such as invalid rank IDs — confirm
78 UnhandledMode unhandled = ABORT;
   // presumably governs how message logs are recovered — confirm
79 MlogRecoveryMode mlog_recovery = MANUAL;
   // presumably governs how spare ranks wait until needed — confirm
80 SpareWaitMode spare_wait = YIELD;
81};
82
83// Configurations before init change this
   // Declared `inline` (C++17) in a header: every translation unit that
   // includes this file refers to the same single Settings object.
   // Default-constructed, so it starts with the in-class defaults of Settings.
84inline Settings fenix_default_settings;
85
// Aggregate of Fenix's runtime state: the active settings, rank counts,
// recovery/role status, registered callbacks, MPI communicators and handles,
// the data-recovery pointer, and the message-logging tables.
//
// NOTE(review): the following members have no in-class initializer:
//   num_initial_ranks, spare_ranks, recover_environment, world, user_world,
//   new_world, dummy_recv_buffer, check_failures_req, agree_op, mpi_errhandler.
// An instance with static storage duration starts with them zeroed; an
// automatic or heap instance leaves them indeterminate until assigned.
//
// NOTE(review): this struct uses std::shared_ptr and jmp_buf, but <memory>
// and <csetjmp>/<setjmp.h> are not among the headers this file includes
// directly — they must be arriving through another include; confirm.
86struct fenix_t {
87 // Global Fenix settings
88 Settings settings;
89
90 int num_initial_ranks;
91 int num_survivor_ranks = 0; // As of last failure
92 int num_recovered_ranks = 0; // As of last failure
93 int spare_ranks; // Spare ranks entered by user
94
   // Non-owning as far as this header shows (raw pointer, no deleter here).
95 jmp_buf* recover_environment; // for FENIX_RESUME_JUMP
96
97 int mpi_fail_code = MPI_SUCCESS;
98 int repair_result = FENIX_SUCCESS; // Result of MPI comm repair
99 int role = FENIX_ROLE_INITIAL_RANK;
100
   // Both flags are ints initialized from `false`, i.e. they start at 0.
101 int fenix_init_flag = false;
102 int finalized = false;
103
   // presumably fail_world points at fail_world_size entries describing the
   // failed ranks — TODO confirm element meaning and ownership
104 int fail_world_size = 0;
105 int* fail_world = nullptr;
106
107 //Save the pointer to role and error of Fenix_Init
108 int* ret_role = nullptr;
109 int* ret_error = nullptr;
110
   // Callback lists keyed by CallbackLocation; starts empty.
111 std::unordered_map<CallbackLocation, std::vector<FenixCallbackFunc>>
112 callbacks;
113 fenix_debug_opt_t options; // This is reserved to store the user options
114
115 MPI_Comm* world; // Duplicate of comm provided by user
116 MPI_Comm* user_world; // User-facing comm with repaired ranks and no spares
117 MPI_Comm new_world; // Internal duplicate of user_world
   // Int flags (start at 0); presumably record whether new_world / user_world
   // currently hold a valid communicator — confirm against the code that sets them
118 int new_world_exists = false, user_world_exists = false;
119
120 //Values used for Fenix_Process_detect_failures
121 int dummy_recv_buffer;
122 MPI_Request check_failures_req;
123
124 MPI_Op agree_op; // Global agreement call for data recovery API
125 MPI_Errhandler mpi_errhandler; // Our custom error handler
126
   // Null until assigned; ownership is not visible from this header.
127 fenix::data::fenix_data_recovery_t* data_recovery = nullptr;
128
129 // -------------------------
130 // Message logging variables
131 // -------------------------
132
133 // All loggers indexed by ID
134 std::unordered_map<int, std::shared_ptr<logging::CommLog>> mlogs;
135 // Order of creation of all existing mlogs
136 std::vector<int> mlog_order;
137 // Active log (if any)
   // Default-constructed shared_ptr, so it is empty (null) until assigned.
138 std::shared_ptr<logging::CommLog> active_mlog;
139 int active_mlog_id = FENIX_MLOG_NONE;
140
141 // Maps of UID to MPI internal types for logging
142 std::map<int, MPI_Datatype> mpi_types;
143 std::map<int, MPI_Op> mpi_ops;
144
   // Static data member: one flag shared by all fenix_t instances, defined
   // in-class via C++17 `inline`. Starts false.
   // NOTE(review): presumably set once Fenix's MPI overloads are linked in —
   // confirm where it is written.
145 static inline bool mpi_overloads_linked = false;
146};
147
// The Fenix runtime-state object. Declared `inline` (C++17) in a header, so
// all translation units that include this file share this one instance.
// It has static storage duration, so it is zero-initialized before its
// default member initializers run; fenix_t members without an initializer
// therefore start at zero in this object.
148inline fenix::fenix_t fenix_rt;
149} // namespace fenix
150#endif // __FENIX_EXT_H__
Contains all API function calls and Fenix types. This is the only header file a user should include.
Fenix_Unhandled_mode
Options for dealing with 'unhandled' errors, e.g. invalid rank IDs.
Definition fenix.h:262
Fenix_Resume_mode
Options for passing control back to application after recovery.
Definition fenix.h:235
Fenix_Recovery_mode
Options for recovering after a failed rank is detected.
Definition fenix.h:193
Fenix_Spare_wait_mode
Options for how spare ranks wait to be needed. Must be set before Fenix_Init to take effect.
Definition fenix.h:278
Fenix_Callback_exception_mode
Options for dealing with CommExceptions generated in callbacks.
Definition fenix.h:293
Fenix_Mlog_recovery_mode
Definition fenix.h:211
@ FENIX_RESUME_MODE_MAXCODE
Not a valid option.
Definition fenix.h:256
@ FENIX_ROLE_INITIAL_RANK
No failures have occurred yet.
Definition fenix.h:162
Definition fenix_opt.hpp:130
Definition fenix_ext.hpp:73
Definition fenix_data_group.hpp:125
Definition fenix_ext.hpp:86