Compadre  1.5.7
Compadre_ApplyTargetEvaluations.hpp
#ifndef _COMPADRE_APPLY_TARGET_EVALUATIONS_HPP_
#define _COMPADRE_APPLY_TARGET_EVALUATIONS_HPP_

#include "Compadre_GMLS.hpp"
namespace Compadre {

/*! \brief For applying the evaluations from a target functional to the polynomial coefficients
    \param data [in/out] - GMLSSolutionData struct (stores the solution in data._d_ss._alphas)
    \param teamMember [in] - Kokkos::TeamPolicy member type (created by parallel_for)
    \param Q [in] - 2D Kokkos View containing the polynomial coefficients
    \param P_target_row [in] - 2D Kokkos View where the evaluation of the polynomial basis is stored
*/
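// A minimal usage sketch (illustrative only, not part of this header): this
// kernel is normally dispatched with one team per target in the current batch.
// The names `gmls_data`, `Q_scratch`, `P_row_scratch`, and `num_targets_in_batch`
// are hypothetical placeholders for objects prepared by the surrounding GMLS solve.
//
//     Kokkos::parallel_for("ApplyTargets",
//             Kokkos::TeamPolicy<>(num_targets_in_batch, Kokkos::AUTO),
//             KOKKOS_LAMBDA (const member_type& teamMember) {
//         applyTargetsToCoefficients(gmls_data, teamMember, Q_scratch, P_row_scratch);
//     });
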
template <typename SolutionData>
KOKKOS_INLINE_FUNCTION
void applyTargetsToCoefficients(const SolutionData& data, const member_type& teamMember, scratch_matrix_right_type Q, scratch_matrix_right_type P_target_row) {

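    // One team handles one target: league_rank() selects the target within this batch.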
    const int target_index = data._initial_index_for_batch + teamMember.league_rank();

#if defined(COMPADRE_USE_CUDA)
//    // GPU
//    for (int j=0; j<_operations.size(); ++j) {
//        for (int k=0; k<_lro_output_tile_size[j]; ++k) {
//            for (int m=0; m<_lro_input_tile_size[j]; ++m) {
//                const int alpha_offset = (_lro_total_offsets[j] + m*_lro_output_tile_size[j] + k)*_neighbor_lists(target_index,0);
//                const int P_offset = _basis_multiplier*target_NP*(_lro_total_offsets[j] + m*_lro_output_tile_size[j] + k);
//                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,
//                        _pc._nla.getNumberOfNeighborsDevice(target_index)), [=] (const int i) {
//
//                    double alpha_ij = 0;
//                    if (_sampling_multiplier>1 && m<_sampling_multiplier) {
//                        const int m_neighbor_offset = i+m*_pc._nla.getNumberOfNeighborsDevice(target_index);
//                        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember,
//                                _basis_multiplier*target_NP), [=] (const int l, double &talpha_ij) {
//                            //for (int l=0; l<_basis_multiplier*target_NP; ++l) {
//                            talpha_ij += P_target_row(P_offset + l)*Q(ORDER_INDICES(m_neighbor_offset,l));
//                        }, alpha_ij);
//                        //}
//                    } else if (_sampling_multiplier == 1) {
//                        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember,
//                                _basis_multiplier*target_NP), [=] (const int l, double &talpha_ij) {
//                            //for (int l=0; l<_basis_multiplier*target_NP; ++l) {
//                            talpha_ij += P_target_row(P_offset + l)*Q(ORDER_INDICES(i,l));
//                        }, alpha_ij);
//                        //}
//                    }
//                    Kokkos::single(Kokkos::PerThread(teamMember), [&] () {
//                        t1(i) = alpha_ij;
//                    });
//                });
//                Kokkos::parallel_for(Kokkos::ThreadVectorRange(teamMember,
//                        _pc._nla.getNumberOfNeighborsDevice(target_index)), [=] (const int i) {
//                    _alphas(ORDER_INDICES(target_index, alpha_offset + i)) = t1(i);
//                });
//                teamMember.team_barrier();
//            }
//        }
//    }

    // GPU
    const auto n_evaluation_sites_per_target = data.additional_number_of_neighbors_list(target_index) + 1;
    const auto nn = data.number_of_neighbors_list(target_index);
    auto alphas = data._d_ss._alphas;
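    // One thread per neighbor (TeamThreadRange); each thread walks every
    // (evaluation site e, operation j, output component k, input component m)
    // combination and reduces over the basis columns with a ThreadVectorRange.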
    Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,
            nn + data._d_ss._added_alpha_size), [&] (const int i) {
        for (int e=0; e<n_evaluation_sites_per_target; ++e) {
            for (int j=0; j<(int)data.operations_size; ++j) {
                for (int k=0; k<data._d_ss._lro_output_tile_size[j]; ++k) {
                    for (int m=0; m<data._d_ss._lro_input_tile_size[j]; ++m) {
                        const int offset_index_jmke = data._d_ss.getTargetOffsetIndex(j,m,k,e);
                        const int alphas_index = data._d_ss.getAlphaIndex(target_index, offset_index_jmke);
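                        // getTargetOffsetIndex flattens (j,m,k,e) into a single row index;
                        // getAlphaIndex turns that row into the base offset of this target's
                        // block in the flat alphas array (the neighbor index i is added below).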
                        double alpha_ij = 0;
                        if (data._sampling_multiplier>1 && m<data._sampling_multiplier) {
                            const int m_neighbor_offset = i+m*nn;
                            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, data.this_num_cols),
                                    [&] (int& l, double& t_alpha_ij) {
                                t_alpha_ij += P_target_row(offset_index_jmke, l)*Q(l, m_neighbor_offset);

                                compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
                                        && "NaN in P_target_row matrix.");
                                compadre_kernel_assert_extreme_debug(Q(l, m_neighbor_offset)==Q(l, m_neighbor_offset)
                                        && "NaN in Q coefficient matrix.");

                            }, alpha_ij);
                        } else if (data._sampling_multiplier == 1) {
                            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, data.this_num_cols),
                                    [&] (int& l, double& t_alpha_ij) {
                                t_alpha_ij += P_target_row(offset_index_jmke, l)*Q(l,i);

                                compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
                                        && "NaN in P_target_row matrix.");
                                compadre_kernel_assert_extreme_debug(Q(l,i)==Q(l,i)
                                        && "NaN in Q coefficient matrix.");

                            }, alpha_ij);
                        }
                        // could use a PerThread here, but performance takes a hit
                        // and it isn't necessary
                        alphas(alphas_index+i) = alpha_ij;
                        compadre_kernel_assert_extreme_debug(alpha_ij==alpha_ij && "NaN in alphas.");

                    }
                }
            }
        }
    });
#else

    // CPU
    const int alphas_per_tile_per_target = data.number_of_neighbors_list(target_index) + data._d_ss._added_alpha_size;
    const global_index_type base_offset_index_jmke = data._d_ss.getTargetOffsetIndex(0,0,0,0);
    const global_index_type base_alphas_index = data._d_ss.getAlphaIndex(target_index, base_offset_index_jmke);

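    // View this target's slice of the flat alphas array as a 2D matrix whose rows
    // are flattened (j,m,k,e) offsets and whose columns are neighbors (plus any
    // additional alpha columns).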
    scratch_matrix_right_type this_alphas(data._d_ss._alphas.data() + TO_GLOBAL(base_alphas_index), data._d_ss._total_alpha_values*data._d_ss._max_evaluation_sites_per_target, alphas_per_tile_per_target);

    auto n_evaluation_sites_per_target = data.additional_number_of_neighbors_list(target_index) + 1;
    const auto nn = data.number_of_neighbors_list(target_index);
    for (int e=0; e<n_evaluation_sites_per_target; ++e) {
        // evaluating alpha_ij
        for (size_t j=0; j<data.operations_size; ++j) {
            for (int k=0; k<data._d_ss._lro_output_tile_size[j]; ++k) {
                for (int m=0; m<data._d_ss._lro_input_tile_size[j]; ++m) {
                    const int offset_index_jmke = data._d_ss.getTargetOffsetIndex(j,m,k,e);
                    for (int i=0; i<nn + data._d_ss._added_alpha_size; ++i) {
                        double alpha_ij = 0;
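                        // For vector sampling functionals (m > 0), the coefficients for
                        // input component m occupy their own block of nn columns in Q.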
                        const int Q_col = i+m*nn;
                        // one thread per team on CPU, so no reason to add inner parallel
                        // reductions here or Kokkos::single. Either will cause significant slowdown.
                        for (int l=0; l<data.this_num_cols; ++l) {
                            if (data._sampling_multiplier>1 && m<data._sampling_multiplier) {

                                alpha_ij += P_target_row(offset_index_jmke, l)*Q(l, Q_col);

                                compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
                                        && "NaN in P_target_row matrix.");
                                compadre_kernel_assert_extreme_debug(Q(l, Q_col)==Q(l, Q_col)
                                        && "NaN in Q coefficient matrix.");

                            } else if (data._sampling_multiplier == 1) {

                                alpha_ij += P_target_row(offset_index_jmke, l)*Q(l, i);

                                compadre_kernel_assert_extreme_debug(P_target_row(offset_index_jmke, l)==P_target_row(offset_index_jmke, l)
                                        && "NaN in P_target_row matrix.");
                                compadre_kernel_assert_extreme_debug(Q(l, i)==Q(l, i)
                                        && "NaN in Q coefficient matrix.");

                            } else {
                                alpha_ij += 0;
                            }
                        }
                        this_alphas(offset_index_jmke,i) = alpha_ij;
                    }
                }
            }
        }
    }
#endif

    teamMember.team_barrier();
}

} // Compadre
#endif