Line data Source code
1 : #ifndef BL_DISTRIBUTIONMAPPING_H
2 : #define BL_DISTRIBUTIONMAPPING_H
3 : #include <AMReX_Config.H>
4 :
5 : #include <AMReX.H>
6 : #include <AMReX_Array.H>
7 : #include <AMReX_Vector.H>
8 : #include <AMReX_Box.H>
9 : #include <AMReX_REAL.H>
10 : #include <AMReX_ParallelDescriptor.H>
11 :
12 : #include <map>
13 : #include <limits>
14 : #include <memory>
15 : #include <cstddef>
16 : #include <iosfwd>
17 :
18 : namespace amrex {
19 :
20 : class BoxArray;
21 : class MultiFab;
22 : template <typename T> class FabArray;
23 : template <typename T> class LayoutData;
24 : class FabArrayBase;
25 :
26 : /**
27 : * \brief Calculates the distribution of FABs to MPI processes.
28 : *
29 : * This class calculates the distribution of FABs to MPI processes in a
30 : * FabArray in a multi-processor environment. By distribution is meant what
31 : * MPI process in the multi-processor environment owns what FAB. Only the BoxArray
32 : * on which the FabArray is built is used in determining the distribution.
33 : * The three types of distributions supported are round-robin, knapsack, and SFC.
34 : * In the round-robin distribution FAB i is owned by CPU i%N where N is total
35 : * number of CPUs. In the knapsack distribution the FABs are partitioned
36 : * across CPUs such that the total volume of the Boxes in the underlying
37 : * BoxArray are as equal across CPUs as is possible. The SFC distribution is
38 : * based on a space filling curve.
39 : */
40 41733 : class DistributionMapping
41 : {
42 : public:
43 :
44 : template <typename T> friend class FabArray;
45 : friend class FabArrayBase;
46 :
47 : //! The distribution strategies
48 : enum Strategy { UNDEFINED = -1, ROUNDROBIN, KNAPSACK, SFC, RRSFC };
49 :
50 : //! The default constructor.
51 : DistributionMapping () noexcept;
52 :
53 : //! The copy constructor.
54 311 : DistributionMapping (const DistributionMapping& rhs) = default;
55 :
56 : //! The copy assignment operator.
57 : DistributionMapping& operator= (const DistributionMapping& rhs) = default;
58 :
59 : //! The move constructor.
60 18 : DistributionMapping (DistributionMapping&& rhs) noexcept = default;
61 :
62 : //! The move assignment operator.
63 : DistributionMapping& operator= (DistributionMapping&& rhs) noexcept = default;
64 :
65 : //! The destructor.
66 1026170 : ~DistributionMapping() noexcept = default;
67 :
68 : /**
69 : * \brief Create an object with the specified mapping.
70 : */
71 : explicit DistributionMapping (const Vector<int>& pmap);
72 : explicit DistributionMapping (Vector<int>&& pmap) noexcept;
73 : //! Build mapping out of BoxArray over nprocs processors.
74 : explicit DistributionMapping (const BoxArray& boxes,
75 : int nprocs = ParallelDescriptor::NProcs());
76 : /**
77 : * \brief This is a very specialized distribution map.
78 : * Do NOT use it unless you really understand what it does.
79 : */
80 : DistributionMapping (const DistributionMapping& d1,
81 : const DistributionMapping& d2);
82 :
83 : /**
84 : * \brief Build mapping out of BoxArray over nprocs processors.
85 : * You need to call this if you built your DistributionMapping
86 : * with the default constructor.
87 : */
88 : void define (const BoxArray& boxes, int nprocs = ParallelDescriptor::NProcs());
89 : /**
90 : * \brief Build mapping out of an Array of ints. You need to call this if you
91 : * built your DistributionMapping with the default constructor.
92 : */
93 : void define (const Vector<int>& pmap);
94 : void define (Vector<int>&& pmap) noexcept;
95 : /**
96 : * \brief Returns a constant reference to the mapping of boxes in the
97 : * underlying BoxArray to the CPU that holds the FAB on that Box.
98 : * ProcessorMap()[i] is an integer in the interval [0, NCPU) where
99 : * NCPU is the number of CPUs being used.
100 : */
101 : [[nodiscard]] const Vector<int>& ProcessorMap () const noexcept;
102 :
103 : //! Length of the underlying processor map.
104 : [[nodiscard]] Long size () const noexcept { return Long(m_ref->m_pmap.size()); }
105 : [[nodiscard]] Long capacity () const noexcept { return Long(m_ref->m_pmap.capacity()); }
106 : [[nodiscard]] bool empty () const noexcept { return m_ref->m_pmap.empty(); }
107 :
108 : //! Number of references to this DistributionMapping
109 : [[nodiscard]] Long linkCount () const noexcept { return m_ref.use_count(); }
110 :
111 : //! Equivalent to ProcessorMap()[index].
112 : [[nodiscard]] int operator[] (int index) const noexcept { return m_ref->m_pmap[index]; }
113 :
114 : std::istream& readFrom (std::istream& is);
115 :
116 : std::ostream& writeOn (std::ostream& os) const;
117 :
118 : //! Set/get the distribution strategy.
119 : static void strategy (Strategy how);
120 :
121 : static Strategy strategy ();
122 :
123 : //! Set/get the space filling curve threshold.
124 : static void SFC_Threshold (int n);
125 :
126 : static int SFC_Threshold ();
127 :
128 : //! Are the distributions equal?
129 : bool operator== (const DistributionMapping& rhs) const noexcept;
130 :
131 : //! Are the distributions different?
132 : bool operator!= (const DistributionMapping& rhs) const noexcept;
133 :
134 : void SFCProcessorMap (const BoxArray& boxes, const std::vector<Long>& wgts, int nprocs,
135 : bool sort=true);
136 : void SFCProcessorMap (const BoxArray& boxes, const std::vector<Long>& wgts, int nprocs,
137 : Real& efficiency, bool sort=true);
138 : void KnapSackProcessorMap (const std::vector<Long>& wgts, int nprocs,
139 : Real* efficiency=nullptr,
140 : bool do_full_knapsack=true,
141 : int nmax=std::numeric_limits<int>::max(),
142 : bool sort=true);
143 : void KnapSackProcessorMap (const DistributionMapping& olddm,
144 : const std::vector<Long>& wgts, Real keep_ratio,
145 : Real& old_efficiency, Real& new_efficiency,
146 : int nmax=std::numeric_limits<int>::max());
147 : void RoundRobinProcessorMap (int nboxes, int nprocs, bool sort=true);
148 : void RoundRobinProcessorMap (const std::vector<Long>& wgts, int nprocs, bool sort=true);
149 :
150 : /**
151 : * \brief Initializes distribution strategy from ParmParse.
152 : *
153 : * ParmParse options are:
154 : *
155 : * DistributionMapping.strategy = ROUNDROBIN
156 : * DistributionMapping.strategy = KNAPSACK
157 : * DistributionMapping.strategy = SFC
158 : * DistributionMapping.strategy = RRFC
159 : */
160 : static void Initialize ();
161 :
162 : static void Finalize ();
163 :
164 : static bool SameRefs (const DistributionMapping& lhs,
165 : const DistributionMapping& rhs)
166 : { return lhs.m_ref == rhs.m_ref; }
167 :
168 : static DistributionMapping makeKnapSack (const MultiFab& weight,
169 : int nmax=std::numeric_limits<int>::max());
170 : static DistributionMapping makeKnapSack (const MultiFab& weight, Real& eff,
171 : int nmax=std::numeric_limits<int>::max());
172 : static DistributionMapping makeKnapSack (const Vector<Real>& rcost,
173 : int nmax=std::numeric_limits<int>::max());
174 : static DistributionMapping makeKnapSack (const Vector<Real>& rcost, Real& eff,
175 : int nmax=std::numeric_limits<int>::max(),
176 : bool sort=true);
177 :
178 : /** \brief Computes a new distribution mapping by distributing input costs
179 : * according to the `knapsack` algorithm.
180 : * @param[in] rcost_local LayoutData of costs; contains, e.g., costs for the
181 : * local boxes in the FAB array, corresponding indices in the global
182 : * indices in the FAB array, and the distribution mapping
183 : * @param[in,out] currentEfficiency writes the efficiency (i.e., mean cost over
184 : * all MPI ranks, normalized to the max cost) given the current
185 : * distribution mapping
186 : * @param[in,out] proposedEfficiency writes the efficiency for the proposed
187 : * distribution mapping
188 : * @param[in] nmax the maximum number of boxes that can be assigned to any
189 : * MPI rank by the knapsack algorithm
190 : * @param[in] broadcastToAll controls whether to transmit the proposed
191 : * distribution mapping to all other processes; setting this to
192 : * false allows to, e.g., test whether the proposed distribution
193 : * mapping is an improvement relative to the current distribution
194 : * mapping, before deciding to broadcast the proposed distribution
195 : * mapping
196 : * @param[in] root which process to collect the local costs from others and
197 : * compute the proposed distribution mapping
198 : * @param[in] keep_ratio controls the fraction of load that should be kept on
199 : * the original process.
200 : * @return the proposed load-balanced distribution mapping
201 : */
202 : static DistributionMapping makeKnapSack (const LayoutData<Real>& rcost_local,
203 : Real& currentEfficiency, Real& proposedEfficiency,
204 : int nmax=std::numeric_limits<int>::max(),
205 : bool broadcastToAll=true,
206 : int root=ParallelDescriptor::IOProcessorNumber(),
207 : Real keep_ratio = Real(0.0));
208 :
209 : static DistributionMapping makeRoundRobin (const MultiFab& weight);
210 : static DistributionMapping makeSFC (const MultiFab& weight, bool sort=true);
211 : static DistributionMapping makeSFC (const MultiFab& weight, Real& eff, bool sort=true);
212 : static DistributionMapping makeSFC (const Vector<Real>& rcost,
213 : const BoxArray& ba, bool sort=true);
214 : static DistributionMapping makeSFC (const Vector<Real>& rcost,
215 : const BoxArray& ba, Real& eff, bool sort=true);
216 :
217 : /** \brief Computes a new distribution mapping by distributing input costs
218 : * according to a `space filling curve` (SFC) algorithm.
219 : * @param[in] rcost_local LayoutData of costs; contains, e.g., costs for the
220 : * local boxes in the FAB array, corresponding indices in the global
221 : * indices in the FAB array, and the distribution mapping
222 : * @param[in,out] currentEfficiency writes the efficiency (i.e., mean cost over
223 : * all MPI ranks, normalized to the max cost) given the current
224 : * distribution mapping
225 : * @param[in,out] proposedEfficiency writes the efficiency for the proposed
226 : * distribution mapping
227 : * @param[in] broadcastToAll controls whether to transmit the proposed
228 : * distribution mapping to all other processes; setting this to
229 : * false allows to, e.g., test whether the proposed distribution
230 : * mapping is an improvement relative to the current distribution
231 : * mapping, before deciding to broadcast the proposed distribution
232 : * mapping
233 : * @param[in] root which process to collect the local costs from others and
234 : * compute the proposed distribution mapping
235 : * @return the proposed load-balanced distribution mapping
236 : */
237 : static DistributionMapping makeSFC (const LayoutData<Real>& rcost_local,
238 : Real& currentEfficiency, Real& proposedEfficiency,
239 : bool broadcastToAll=true,
240 : int root=ParallelDescriptor::IOProcessorNumber());
241 :
242 : /**
243 : * if use_box_vol is true, weight boxes by their volume in Distribute
244 : * otherwise, all boxes will be treated with equal weight
245 : */
246 : static std::vector<std::vector<int> > makeSFC (const BoxArray& ba,
247 : bool use_box_vol=true,
248 : int nprocs=ParallelContext::NProcsSub() );
249 :
250 : /** \brief Computes the average cost per MPI rank given a distribution mapping
251 : * global cost vector.
252 : * @param[in] dm distribution mapping (mapping from FAB to MPI processes)
253 : * @param[in] cost vector giving mapping from FAB to the corresponding cost
254 : * @param[in,out] efficiency average cost per MPI process, as computed from
255 : * the given distribution mapping and cost
256 : */
257 : template <typename T>
258 : static void ComputeDistributionMappingEfficiency (const DistributionMapping& dm,
259 : const std::vector<T>& cost,
260 : Real* efficiency);
261 :
262 : private:
263 :
264 : const Vector<int>& getIndexArray ();
265 : const std::vector<bool>& getOwnerShip ();
266 :
267 : //! Ways to create the processor map.
268 : void RoundRobinProcessorMap (const BoxArray& boxes, int nprocs);
269 : void KnapSackProcessorMap (const BoxArray& boxes, int nprocs);
270 : void SFCProcessorMap (const BoxArray& boxes, int nprocs);
271 : void RRSFCProcessorMap (const BoxArray& boxes, int nprocs);
272 :
273 : using LIpair = std::pair<Long,int>;
274 :
275 : struct LIpairLT
276 : {
277 : bool operator () (const LIpair& lhs,
278 : const LIpair& rhs) const noexcept
279 : {
280 : return lhs.first < rhs.first;
281 : }
282 : };
283 :
284 : struct LIpairGT
285 : {
286 : bool operator () (const LIpair& lhs,
287 : const LIpair& rhs) const noexcept
288 : {
289 : return lhs.first > rhs.first;
290 : }
291 : };
292 :
293 : static void Sort (std::vector<LIpair>& vec, bool reverse);
294 :
295 : void RoundRobinDoIt (int nboxes,
296 : int nprocs,
297 : std::vector<LIpair>* LIpairV = nullptr,
298 : bool sort = true);
299 :
300 : void KnapSackDoIt (const std::vector<Long>& wgts,
301 : int nprocs,
302 : Real& efficiency,
303 : bool do_full_knapsack,
304 : int nmax=std::numeric_limits<int>::max(),
305 : bool sort=true);
306 :
307 : void SFCProcessorMapDoIt (const BoxArray& boxes,
308 : const std::vector<Long>& wgts,
309 : int nprocs,
310 : bool sort=true,
311 : Real* efficiency=nullptr);
312 :
313 : void RRSFCDoIt (const BoxArray& boxes,
314 : int nprocs);
315 :
316 : //! Least used ordering of CPUs (by # of bytes of FAB data).
317 : static void LeastUsedCPUs (int nprocs, Vector<int>& result);
318 : /**
319 : * \brief rteam: Least used ordering of Teams
320 : * rworker[i]: Least used ordering of team workers for Team i
321 : */
322 : static void LeastUsedTeams (Vector<int>& rteam, Vector<Vector<int> >& rworker, int nteams, int nworkers);
323 :
324 : //! A useful typedef.
325 : using PVMF = void (DistributionMapping::*)(const BoxArray &, int);
326 :
327 : //! Everyone uses the same Strategy -- defaults to SFC.
328 : static Strategy m_Strategy;
329 : /**
330 : * \brief Pointer to one of the CreateProcessorMap() functions.
331 : * Corresponds to the one specified by m_Strategy.
332 : */
333 : static PVMF m_BuildMap;
334 :
335 : struct Ref
336 : {
337 : friend class DistributionMapping;
338 :
339 : //! Constructors to match those in DistributionMapping ....
340 : Ref () = default;
341 :
342 : explicit Ref (int len) : m_pmap(len) {}
343 :
344 : explicit Ref (const Vector<int>& pmap) : m_pmap(pmap) {}
345 :
346 : explicit Ref (Vector<int>&& pmap) noexcept : m_pmap(std::move(pmap)) {}
347 :
348 : //! dtor, copy-ctor, copy-op=, move-ctor, and move-op= are compiler generated.
349 :
350 : void clear () { m_pmap.clear(); m_index_array.clear(); m_ownership.clear(); }
351 :
352 : Vector<int> m_pmap; //!< index array for all boxes
353 : Vector<int> m_index_array; //!< index array for local boxes owned by the team
354 : std::vector<bool> m_ownership; //!< true ownership
355 : };
356 : //
357 : //! The data -- a reference-counted pointer to a Ref.
358 : std::shared_ptr<Ref> m_ref;
359 :
360 : public:
361 : struct RefID {
362 886192 : constexpr RefID () noexcept {} // =default does not work due to a clang bug // NOLINT
363 0 : explicit RefID (Ref* data_) noexcept : data(data_) {}
364 : bool operator< (const RefID& rhs) const noexcept { return std::less<>{}(data,rhs.data); }
365 : bool operator== (const RefID& rhs) const noexcept { return data == rhs.data; }
366 0 : bool operator!= (const RefID& rhs) const noexcept { return data != rhs.data; }
367 : [[nodiscard]] const Ref *dataPtr() const noexcept { return data; }
368 : void PrintPtr(std::ostream &os) const { os << data << '\n'; }
369 : friend std::ostream& operator<< (std::ostream& os, const RefID& id);
370 : private:
371 : Ref* data = nullptr;
372 : };
373 :
374 : //! This gives a unique ID of the reference, which is different from dmID above.
375 0 : [[nodiscard]] RefID getRefID () const noexcept { return RefID { m_ref.get() }; }
376 : };
377 :
378 : //! Our output operator.
379 : std::ostream& operator<< (std::ostream& os, const DistributionMapping& pmap);
380 :
381 : std::ostream& operator<< (std::ostream& os, const DistributionMapping::RefID& id);
382 :
383 : /**
384 : * \brief Function that creates a DistributionMapping "similar" to that of a MultiFab.
385 : *
386 : * "Similar" means that, if a box in "ba" intersects with any of the
387 : * boxes in the BoxArray associated with "mf", taking "ngrow" ghost cells into account,
388 : * then that box will be assigned to the proc owning the one it has the maximum amount
389 : * of overlap with.
390 : *
391 : * @param[in] ba The BoxArray we want to generate a DistributionMapping for.
392 : * @param[in] mf The MultiFab we want said DistributionMapping to be similar to.
393 : * @param[in] ng The number of grow cells to use when computing intersection / overlap
394 : * @return The computed DistributionMapping.
395 : */
396 : DistributionMapping MakeSimilarDM (const BoxArray& ba, const MultiFab& mf, const IntVect& ng);
397 :
398 : /**
399 : * \brief Function that creates a DistributionMapping "similar" to that of a MultiFab.
400 : *
401 : * "Similar" means that, if a box in "ba" intersects with any of the
402 : * boxes in the BoxArray associated with "mf", taking "ngrow" ghost cells into account,
403 : * then that box will be assigned to the proc owning the one it has the maximum amount
404 : * of overlap with.
405 : *
406 : * @param[in] ba The BoxArray we want to generate a DistributionMapping for.
407 : * @param[in] src_ba The BoxArray associated with the src DistributionMapping.
408 : * @param[in] src_dm The input DistributionMapping we want the output to be similar to.
409 : * @param[in] ng The number of grow cells to use when computing intersection / overlap
410 : * @return The computed DistributionMapping.
411 : */
412 : DistributionMapping MakeSimilarDM (const BoxArray& ba, const BoxArray& src_ba,
413 : const DistributionMapping& src_dm, const IntVect& ng);
414 :
415 : template <typename T>
416 : void DistributionMapping::ComputeDistributionMappingEfficiency (
417 : const DistributionMapping& dm, const std::vector<T>& cost, Real* efficiency)
418 : {
419 : const int nprocs = ParallelDescriptor::NProcs();
420 : Vector<T> wgts(nprocs, T(0));
421 :
422 : const auto nboxes = int(dm.size());
423 : for (int ibox = 0; ibox < nboxes; ++ibox) {
424 : wgts[dm[ibox]] += cost[ibox];
425 : }
426 :
427 : T max_weight = 0;
428 : T sum_weight = 0;
429 : for (auto const& w : wgts) {
430 : max_weight = std::max(w, max_weight);
431 : sum_weight += w;
432 : }
433 :
434 : *efficiency = static_cast<Real>(sum_weight) /
435 : (static_cast<Real>(nprocs) * static_cast<Real>(max_weight));
436 : }
437 :
438 : }
439 :
440 : #endif /*BL_DISTRIBUTIONMAPPING_H*/
|