Fenix @develop
 
Loading...
Searching...
No Matches
fenix.h
Go to the documentation of this file.
1/*
2//@HEADER
3// ************************************************************************
4//
5//
6// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _|
7// _| _| _|_| _| _| _| _|
8// _|_|_| _|_|_| _| _| _| _| _|
9// _| _| _| _|_| _| _| _|
10// _| _|_|_|_| _| _| _|_|_| _| _|
11//
12//
13//
14//
15// Copyright (C) 2016 Rutgers University and Sandia Corporation
16//
17// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
18// the U.S. Government retains certain rights in this software.
19//
20// Redistribution and use in source and binary forms, with or without
21// modification, are permitted provided that the following conditions are
22// met:
23//
24// 1. Redistributions of source code must retain the above copyright
25// notice, this list of conditions and the following disclaimer.
26//
27// 2. Redistributions in binary form must reproduce the above copyright
28// notice, this list of conditions and the following disclaimer in the
29// documentation and/or other materials provided with the distribution.
30//
31// 3. Neither the name of the Corporation nor the names of the
32// contributors may be used to endorse or promote products derived from
33// this software without specific prior written permission.
34//
35// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
36// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
39// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
40// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46//
47// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
48// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
49//
50// Questions? Contact Keita Teranishi (knteran@sandia.gov) and
51// Marc Gamell (mgamell@cac.rutgers.edu)
52//
53// ************************************************************************
54//@HEADER
55*/
56
57#ifndef __FENIX__
58#define __FENIX__
59
60#include <mpi.h>
61#include <setjmp.h>
62
63#if defined(c_plusplus) || defined(__cplusplus)
64#include "fenix.hpp"
65
66extern "C" {
67#endif
68
69
70#include "fenix_data_subset.h"
71#include "fenix_init.h"
72
/**
 * Status codes. FENIX_SUCCESS is zero, every FENIX_ERROR_* value is negative
 * and every FENIX_WARNING_* value is positive.
 *
 * NOTE(review): -133 and -155 fall outside the otherwise sequential numbering
 * of the error codes; these values are visible to callers, so confirm before
 * renumbering anything.
 */
85#define FENIX_SUCCESS 0
86#define FENIX_ERROR_UNINITIALIZED -9
87#define FENIX_ERROR_NOCATEGORY -10
88#define FENIX_ERROR_CALLBACK_NOT_REGISTERED -11
89#define FENIX_ERROR_GROUP_CREATE -12
90#define FENIX_ERROR_MEMBER_CREATE -13
91#define FENIX_ERROR_COMMIT_BARRIER -133
92#define FENIX_ERROR_INVALID_GROUPID -14
93#define FENIX_ERROR_INVALID_MEMBERID -15
94#define FENIX_ERROR_INVALID_LOGIC_CALL -155
95#define FENIX_ERROR_INVALID_TIMESTAMP -16
96#define FENIX_ERROR_INVALID_DEPTH -17
97#define FENIX_ERROR_INVALID_ATTRIBUTE_NAME -18
98#define FENIX_ERROR_INVALID_ATTRIBUTE_VALUE -19
99#define FENIX_ERROR_INVALID_POSITION -20
100#define FENIX_ERROR_DATA_WAIT -21
101#define FENIX_ERROR_SUBSET_NUM_BLOCKS -22
102#define FENIX_ERROR_SUBSET_START_OFFSET -23
103#define FENIX_ERROR_SUBSET_END_OFFSET -24
104#define FENIX_ERROR_SUBSET_STRIDE -25
105#define FENIX_ERROR_NODATA_FOUND -30
106#define FENIX_ERROR_INTERN -40
107#define FENIX_ERROR_CANCELLED -50
108#define FENIX_WARNING_SPARE_RANKS_DEPLETED 100
109#define FENIX_WARNING_PARTIAL_RESTORE 101
/**
 * Location identifiers.
 *
 * NOTE(review): the values 1, 2 and 4 are distinct single bits, so these may
 * be intended as OR-able flags -- confirm against the code that consumes them.
 */
113#define FENIX_ERRHANDLER_LOC 1
115#define FENIX_FINALIZE_LOC 2
117#define FENIX_DATA_COMMIT_BARRIER_LOC 4
118
119
120
148
/**
 * Initialize Fenix at the call site.
 *
 * Expands to a brace-enclosed block that, in order:
 *   1. declares a block-scope `static jmp_buf` (one buffer per expansion
 *      site);
 *   2. calls __fenix_preinit() with every macro argument plus the address of
 *      that buffer, and stores the return value in *(_role);
 *   3. calls setjmp() on the buffer; whenever setjmp() returns non-zero,
 *      *(_role) is overwritten with FENIX_ROLE_SURVIVOR_RANK;
 *   4. calls __fenix_postinit(_error).
 *
 * NOTE(review): presumably this is a macro so that setjmp() executes in the
 * caller's own frame, and the library longjmp()s back to this buffer once a
 * failure has been repaired -- confirm against the implementation. If so,
 * the enclosing function must still be active whenever that jump happens.
 *
 * Usage caveats visible from the expansion:
 *   - _role is expanded three times, so pass an expression without side
 *     effects;
 *   - the expansion is a bare { } block rather than do { } while (0), so
 *     `if (c) Fenix_Init(...); else ...` does not compile.
 */
217#define Fenix_Init(_role, _comm, _newcomm, _argc, _argv, _spare_ranks, \
218 _spawn, _info, _error) \
219 { \
220 static jmp_buf bufjmp; \
221 *(_role) = __fenix_preinit(_role, _comm, _newcomm, _argc, \
222 _argv, _spare_ranks, _spawn, _info, \
223 _error, &bufjmp); \
224 if(setjmp(bufjmp)) { \
225 *(_role) = FENIX_ROLE_SURVIVOR_RANK; \
226 } \
227 __fenix_postinit( _error ); \
228 }
229
230
/** Sets flag to true if Fenix_Init has been called, else false. */
236int Fenix_Initialized(int *flag);
237
/**
 * Register a callback to be invoked after failure process recovery.
 * callback_data is an opaque pointer supplied alongside the callback;
 * presumably it is handed back as the callback's void* argument -- confirm.
 */
258int Fenix_Callback_register(void (*recover)(MPI_Comm, int, void *),
259 void *callback_data);
260
266
/** Check for any failed ranks. */
274int Fenix_Process_detect_failures(int do_recovery);
275
278
/** UNIMPLEMENTED. Returns the Fenix_Rank_role for a given rank. */
280int Fenix_get_role(MPI_Comm comm, int rank, int *role);
281
/** Get the list of ranks that failed in the most recent failure. */
287int Fenix_Process_fail_list(int** fail_list);
288
/** Check a pre-recovery request without error. */
296int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status);
297
298
/**
 * Clean up Fenix state. Each active rank must call Fenix_Finalize before
 * exiting.
 *
 * Declared with (void): in C an empty parameter list leaves the arguments
 * unspecified, so the compiler would not diagnose a call that passes
 * arguments. (void) has the same meaning when this header is read as C++.
 */
317int Fenix_Finalize(void);
318
/**
 * Constants used with the Fenix_Data_* functions.
 *
 * NOTE(review): values are not unique across the name families below -- 11,
 * 12 and 13 each appear twice (e.g. FENIX_GROUP_ID_MAX and
 * FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER are both 11). Presumably each constant
 * is only meaningful for the parameter its name refers to; confirm against
 * the call sites before comparing constants from different families.
 */
329#define FENIX_DATA_GROUP_WORLD_ID 10
330#define FENIX_GROUP_ID_MAX 11
331#define FENIX_TIME_STAMP_MAX 12
332#define FENIX_DATA_MEMBER_ALL 15
333#define FENIX_DATA_MEMBER_ATTRIBUTE_BUFFER 11
334#define FENIX_DATA_MEMBER_ATTRIBUTE_COUNT 12
335#define FENIX_DATA_MEMBER_ATTRIBUTE_DATATYPE 13
336#define FENIX_DATA_MEMBER_ATTRIBUTE_SIZE 14
337#define FENIX_DATA_SNAPSHOT_LATEST -1
338#define FENIX_DATA_SNAPSHOT_ALL 16
339#define FENIX_DATA_SUBSET_CREATED 2
340
341#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13
342
/**
 * UNIMPLEMENTED. Request handle for the asynchronous Fenix_Data_* calls
 * (Fenix_Data_member_istore, Fenix_Data_wait, Fenix_Data_test); it wraps one
 * MPI request for sending and one for receiving.
 */
346typedef struct {
347 MPI_Request mpi_send_req;
348 MPI_Request mpi_recv_req;
349} Fenix_Request;
350
353
356
357
/** Create a Data Group. */
389int Fenix_Data_group_create(int group_id, MPI_Comm comm, int start_time_stamp,
390 int depth, int policy_name, void* policy_value,
391 int* flag);
392
/** Create a data member for store/restore operations. */
415int Fenix_Data_member_create(int group_id, int member_id, void *buffer,
416 int count, MPI_Datatype datatype);
417
/** Get the storage policy of a data group. */
428int Fenix_Data_group_get_redundancy_policy(int group_id, int* policy_name,
429 void *policy_value, int *flag);
430
/** UNIMPLEMENTED. Block on completion of a store operation. */
432int Fenix_Data_wait(Fenix_Request request);
433
434
/** UNIMPLEMENTED. Query completion of a store operation. */
436int Fenix_Data_test(Fenix_Request request, int *flag);
437
438
/**
 * Store a particular group member into the group's resilient storage space,
 * in uncommitted storage.
 */
452int Fenix_Data_member_store(int group_id, int member_id,
453 Fenix_Data_subset subset_specifier);
454
455
/** UNIMPLEMENTED. Variant of Fenix_Data_member_store. */
457int Fenix_Data_member_storev(int group_id, int member_id,
458 Fenix_Data_subset subset_specifier);
459
/**
 * UNIMPLEMENTED. Variant of Fenix_Data_member_store that additionally takes
 * a Fenix_Request pointer.
 */
461int Fenix_Data_member_istore(int group_id, int member_id,
462 Fenix_Data_subset subset_specifier,
463 Fenix_Request *request);
464
/** UNIMPLEMENTED. Variant of Fenix_Data_member_istore. */
466int Fenix_Data_member_istorev(int group_id, int member_id,
467 Fenix_Data_subset subset_specifier,
468 Fenix_Request *request);
469
/** Commit stored data members to the group's next snapshot. */
488int Fenix_Data_commit(int group_id, int *time_stamp);
489
/** As Fenix_Data_commit, but ensures a globally consistent commit. */
503int Fenix_Data_commit_barrier(int group_id, int *time_stamp);
504
/** UNIMPLEMENTED. Block until all ranks in the group have reached this call. */
506int Fenix_Data_barrier(int group_id);
507
/** Restore the data of a group member from a snapshot. */
532int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer,
533 int max_count, int time_stamp, Fenix_Data_subset* found_data);
534
/** Local-only version of Fenix_Data_member_restore. */
549int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer,
550 int max_count, int time_stamp, Fenix_Data_subset* found_data);
551
/**
 * UNIMPLEMENTED. As Fenix_Data_member_restore, with an extra source_rank
 * argument (presumably the rank to restore from -- confirm).
 */
553int Fenix_Data_member_restore_from_rank(int group_id, int member_id, void *data,
554 int max_count, int time_stamp,
555 Fenix_Data_subset* found_data, int source_rank);
556
/** Create a data subset for use in store operations. */
580int Fenix_Data_subset_create(int num_blocks, int start_offset, int end_offset,
581 int stride, Fenix_Data_subset *subset_specifier);
582
/** As Fenix_Data_subset_create, but with varying start and end offsets. */
599int Fenix_Data_subset_createv(int num_blocks, int *array_start_offsets,
600 int *array_end_offsets,
601 Fenix_Data_subset *subset_specifier);
602
/** Delete a data subset. */
611int Fenix_Data_subset_delete(Fenix_Data_subset *subset_specifier);
612
/** UNIMPLEMENTED. Get the number of members in a data group. */
614int Fenix_Data_group_get_number_of_members(int group_id, int *number_of_members);
615
/** UNIMPLEMENTED. Get member ID based on member index. */
617int Fenix_Data_group_get_member_at_position(int group_id, int *member_id,
618 int position);
619
630 int *number_of_snapshots);
631
/** Get the time stamp of a snapshot at a given index. */
643int Fenix_Data_group_get_snapshot_at_position(int group_id, int position,
644 int *time_stamp);
645
/** UNIMPLEMENTED. Get the value of a member's attribute. */
647int Fenix_Data_member_attr_get(int group_id, int member_id, int attributename,
648 void *attributevalue, int *flag, int source_rank);
649
/** Set the value of a member's attribute. */
667int Fenix_Data_member_attr_set(int group_id, int member_id, int attribute_name,
668 void *attribute_value, int *flag);
669
/** Delete a snapshot from a data group. */
678int Fenix_Data_snapshot_delete(int group_id, int time_stamp);
679
/** Delete a data group. */
687int Fenix_Data_group_delete(int group_id);
688
/** Delete a data member. */
697int Fenix_Data_member_delete(int group_id, int member_id);
700#if defined(c_plusplus) || defined(__cplusplus)
701}
702#endif
703
704#endif // __FENIX__
int Fenix_Data_member_istore(int group_id, int member_id, Fenix_Data_subset subset_specifier, Fenix_Request *request)
[UNIMPLEMENTED] As Fenix_Data_member_store, ...
Definition fenix.cpp:119
const Fenix_Data_subset FENIX_DATA_SUBSET_FULL
A standin for checkpointing/recovering all available data in a member.
Definition fenix.cpp:63
int Fenix_Data_subset_delete(Fenix_Data_subset *subset_specifier)
Delete a data subset.
Definition fenix.cpp:159
int Fenix_Data_group_get_redundancy_policy(int group_id, int *policy_name, void *policy_value, int *flag)
Get the storage policy of a data group.
Definition fenix.cpp:99
int Fenix_Data_group_delete(int group_id)
Delete a data group.
Definition fenix.cpp:191
const Fenix_Data_subset FENIX_DATA_SUBSET_EMPTY
A standin for checkpointing/recovering none of the available data in a member.
Definition fenix.cpp:64
int Fenix_Data_group_create(int group_id, MPI_Comm comm, int start_time_stamp, int depth, int policy_name, void *policy_value, int *flag)
Create a Data Group.
Definition fenix.cpp:90
int Fenix_Data_member_attr_get(int group_id, int member_id, int attributename, void *attributevalue, int *flag, int source_rank)
[UNIMPLEMENTED] Get the value of a member's attribute.
Definition fenix.cpp:179
int Fenix_Data_member_storev(int group_id, int member_id, Fenix_Data_subset subset_specifier)
[UNIMPLEMENTED] As Fenix_Data_member_store, ...
Definition fenix.cpp:115
int Fenix_Data_member_create(int group_id, int member_id, void *buffer, int count, MPI_Datatype datatype)
Create a data member for store/restore operations.
Definition fenix.cpp:95
int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset *found_data)
Restore the data of a group member from a snapshot.
Definition fenix.cpp:139
int Fenix_Data_snapshot_delete(int group_id, int time_stamp)
Delete a snapshot from a data group.
Definition fenix.cpp:187
int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset *found_data)
Local-only version of Fenix_Data_member_restore.
Definition fenix.cpp:143
int Fenix_Data_group_get_number_of_snapshots(int group_id, int *number_of_snapshots)
Get the number of locally-available snapshots in a data group.
Definition fenix.cpp:171
int Fenix_Data_group_get_number_of_members(int group_id, int *number_of_members)
[UNIMPLEMENTED] Get the number of members in a data group.
Definition fenix.cpp:163
int Fenix_Data_group_get_member_at_position(int group_id, int *member_id, int position)
[UNIMPLEMENTED] Get member ID based on member index.
Definition fenix.cpp:167
int Fenix_Data_wait(Fenix_Request request)
[UNIMPLEMENTED] Block on completion of the store operation specified b...
Definition fenix.cpp:103
int Fenix_Data_test(Fenix_Request request, int *flag)
[UNIMPLEMENTED] Query completion of the store operation specified by t...
Definition fenix.cpp:107
int Fenix_Data_member_restore_from_rank(int group_id, int member_id, void *data, int max_count, int time_stamp, Fenix_Data_subset *found_data, int source_rank)
[UNIMPLEMENTED] As Fenix_Data_member_restore, but restores from a spec...
Definition fenix.cpp:147
int Fenix_Data_subset_createv(int num_blocks, int *array_start_offsets, int *array_end_offsets, Fenix_Data_subset *subset_specifier)
As Fenix_Data_subset_create, but with varying start and end offsets.
Definition fenix.cpp:155
int Fenix_Data_member_store(int group_id, int member_id, Fenix_Data_subset subset_specifier)
Store a particular group member into the group's resilient storage space, in uncommitted storage.
Definition fenix.cpp:111
int Fenix_Data_member_delete(int group_id, int member_id)
Delete a data member.
Definition fenix.cpp:195
int Fenix_Data_commit_barrier(int group_id, int *time_stamp)
As commit, but ensures a globally consistent commit.
Definition fenix.cpp:131
int Fenix_Data_member_attr_set(int group_id, int member_id, int attribute_name, void *attribute_value, int *flag)
Set the value of a member's attribute.
Definition fenix.cpp:183
int Fenix_Data_member_istorev(int group_id, int member_id, Fenix_Data_subset subset_specifier, Fenix_Request *request)
[UNIMPLEMENTED] As Fenix_Data_member_istore, ...
Definition fenix.cpp:123
int Fenix_Data_commit(int group_id, int *time_stamp)
Commit stored data members to the group's next snapshot.
Definition fenix.cpp:127
int Fenix_Data_group_get_snapshot_at_position(int group_id, int position, int *time_stamp)
Get the time stamp of a snapshot at a given index.
Definition fenix.cpp:175
int Fenix_Data_barrier(int group_id)
[UNIMPLEMENTED] Block until all ranks in the group have reached this p...
Definition fenix.cpp:135
int Fenix_Data_subset_create(int num_blocks, int start_offset, int end_offset, int stride, Fenix_Data_subset *subset_specifier)
Create a data subset for use in store operations.
Definition fenix.cpp:151
int Fenix_Callback_pop()
Pop the most recently registered callback from the callback stack.
Definition fenix.cpp:76
int Fenix_Callback_register(void(*recover)(MPI_Comm, int, void *), void *callback_data)
Register a callback to be invoked after failure process recovery.
Definition fenix.cpp:70
int Fenix_Process_fail_list(int **fail_list)
Get the list of ranks that failed in the most recent failure.
Definition fenix.cpp:199
int Fenix_get_number_of_ranks_with_role(int, int *)
[UNIMPLEMENTED] Returns the number of ranks with a given Fenix_Rank_role ...
int Fenix_Initialized(int *flag)
Sets flag to true if Fenix_Init has been called, else false.
Definition fenix.cpp:80
int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status)
Check a pre-recovery request without error.
Definition fenix.cpp:204
int Fenix_Finalize()
Clean up Fenix state. Each active rank must call Fenix_Finalize before exiting.
Definition fenix.cpp:85
int Fenix_get_role(MPI_Comm comm, int rank, int *role)
[UNIMPLEMENTED] Returns the Fenix_Rank_role for a given rank.
int Fenix_Process_detect_failures(int do_recovery)
Check for any failed ranks.
Definition fenix.cpp:219
Fenix_Rank_role
All possible roles returned by Fenix_Init.
Definition fenix.h:140
@ FENIX_ROLE_RECOVERED_RANK
This rank was a spare before the most recent failure, or was just spawned.
Definition fenix.h:144
@ FENIX_ROLE_INITIAL_RANK
No failures have occurred yet.
Definition fenix.h:142
@ FENIX_ROLE_SURVIVOR_RANK
This rank was not a spare before the most recent failure.
Definition fenix.h:146
Definition fenix_data_subset.h:71
Fenix_Request
[UNIMPLEMENTED] As MPI_Request, but for Fenix asynchronous data recove...
Definition fenix.h:346