#ifndef AMREX_GPU_DEVICE_H_
#define AMREX_GPU_DEVICE_H_
#include <AMReX_Config.H>

#include <AMReX.H>
#include <AMReX_Extension.H>
#include <AMReX_Utility.H>
#include <AMReX_GpuTypes.H>
#include <AMReX_GpuError.H>
#include <AMReX_GpuControl.H>
#include <AMReX_OpenMP.H>
#include <AMReX_Vector.H>

#include <algorithm>
#include <array>
#include <cstdlib>
#include <memory>

#define AMREX_GPU_MAX_STREAMS 8

#ifdef AMREX_USE_GPU
namespace amrex {
#ifdef AMREX_USE_HIP
using gpuDeviceProp_t = hipDeviceProp_t;
#elif defined(AMREX_USE_CUDA)
using gpuDeviceProp_t = cudaDeviceProp;
#elif defined(AMREX_USE_SYCL)
struct gpuDeviceProp_t {
    std::string name;
    std::string vendor; // SYCL only (inferred for CUDA and HIP)
    std::size_t totalGlobalMem;
    std::size_t sharedMemPerBlock;
    int multiProcessorCount;
    int maxThreadsPerMultiProcessor;
    int maxThreadsPerBlock;
    int maxThreadsDim[3];
    int maxGridSize[3];
    int warpSize;
    Long maxMemAllocSize; // SYCL only
    int managedMemory;
    int concurrentManagedAccess;
    int maxParameterSize;
};
#endif
}
#endif

namespace amrex::Gpu {

class Device
{

public:

    static void Initialize ();
    static void Finalize ();

#if defined(AMREX_USE_GPU)
    static gpuStream_t gpuStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
#ifdef AMREX_USE_CUDA
    /** for backward compatibility */
    static cudaStream_t cudaStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
#endif
#ifdef AMREX_USE_SYCL
    static sycl::queue& streamQueue () noexcept { return *(gpu_stream[OpenMP::get_thread_num()].queue); }
    static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].queue); }
#endif
#endif

    static int numGpuStreams () noexcept {
        return inSingleStreamRegion() ? 1 : max_gpu_streams;
    }

    static void setStreamIndex (int idx) noexcept;
    static void resetStreamIndex () noexcept { setStreamIndex(0); }
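
    // A minimal usage sketch (illustrative, not part of the AMReX API):
    // cycling independent pieces of work over the stream pool so they can
    // overlap on the device.  `launch_async_work` is a hypothetical function
    // that enqueues work on the current stream.
    //
    //   for (int i = 0; i < nwork; ++i) {
    //       Device::setStreamIndex(i % Device::numGpuStreams());
    //       launch_async_work(i); // hypothetical; enqueues on the current stream
    //   }
    //   Device::resetStreamIndex();
    //   Device::streamSynchronizeAll(); // declared below; waits on all streams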

#ifdef AMREX_USE_GPU
    static int streamIndex (gpuStream_t s = gpuStream()) noexcept;

    static gpuStream_t setStream (gpuStream_t s) noexcept;
    static gpuStream_t resetStream () noexcept;
#endif

    static int deviceId () noexcept;
    static int numDevicesUsed () noexcept; // Total number of devices used
    static int numDevicePartners () noexcept; // Number of partners sharing my device

    /**
     * Halt execution of code until GPU has finished processing all previously requested
     * tasks.
     */
    static void synchronize () noexcept;

    /**
     * Halt execution of code until the current AMReX GPU stream has finished processing all
     * previously requested tasks.
     */
    static void streamSynchronize () noexcept;

    /**
     * Halt execution of code until all AMReX GPU streams have finished processing all
     * previously requested tasks.
     */
    static void streamSynchronizeAll () noexcept;
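
    // The three scopes differ as follows (illustrative sketch;
    // `queue_async_work` is a hypothetical operation enqueued on the
    // current stream):
    //
    //   queue_async_work();             // asynchronous w.r.t. the host
    //   Device::streamSynchronize();    // waits on the current stream only
    //   Device::streamSynchronizeAll(); // waits on every stream in the pool
    //   Device::synchronize();          // waits on the entire device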

#if defined(__CUDACC__)
    /** Generic graph selection. These should be called by users. */
    static void startGraphRecording(bool first_iter, void* h_ptr, void* d_ptr, size_t sz);
    static cudaGraphExec_t stopGraphRecording(bool last_iter);

112 : static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);
113 :
114 : /** Execute an instantiated cudaGraphExec */
115 : static void executeGraph(const cudaGraphExec_t &graphExec, bool synch = true);
116 :
117 : #endif
118 :

    static void mem_advise_set_preferred (void* p, std::size_t sz, int device);
    static void mem_advise_set_readonly (void* p, std::size_t sz);

#ifdef AMREX_USE_GPU
    static void setNumThreadsMin (int nx, int ny, int nz) noexcept;
    static void n_threads_and_blocks (const Long N, dim3& numBlocks, dim3& numThreads) noexcept;
    static void c_comps_threads_and_blocks (const int* lo, const int* hi, const int comps,
                                            dim3& numBlocks, dim3& numThreads) noexcept;
    static void c_threads_and_blocks (const int* lo, const int* hi, dim3& numBlocks, dim3& numThreads) noexcept;
    static void grid_stride_threads_and_blocks (dim3& numBlocks, dim3& numThreads) noexcept;

    static std::size_t totalGlobalMem () noexcept { return device_prop.totalGlobalMem; }
    static std::size_t sharedMemPerBlock () noexcept { return device_prop.sharedMemPerBlock; }
    static int numMultiProcessors () noexcept { return device_prop.multiProcessorCount; }
    static int maxThreadsPerMultiProcessor () noexcept { return device_prop.maxThreadsPerMultiProcessor; }
    static int maxThreadsPerBlock () noexcept { return device_prop.maxThreadsPerBlock; }
    static int maxThreadsPerBlock (int dir) noexcept { return device_prop.maxThreadsDim[dir]; }
    static int maxBlocksPerGrid (int dir) noexcept { return device_prop.maxGridSize[dir]; }
    static std::string deviceName () noexcept { return std::string(device_prop.name); }
#endif
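
    // For example (sketch only), the accessors above can report the device at
    // startup using amrex::Print():
    //
    //   amrex::Print() << "GPU: " << Device::deviceName() << ", "
    //                  << Device::totalGlobalMem()/(1024*1024) << " MB global memory, "
    //                  << Device::numMultiProcessors() << " multiprocessors\n";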

#ifdef AMREX_USE_CUDA
    static int devicePropMajor () noexcept { return device_prop.major; }
    static int devicePropMinor () noexcept { return device_prop.minor; }
#endif

    static std::string deviceVendor() noexcept
    {
#if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
        return std::string("AMD");
#elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
        // Using HIP on NVIDIA GPUs isn't currently supported by AMReX
        return std::string("NVIDIA");
#elif defined(AMREX_USE_SYCL)
        return device_prop.vendor;
#else
        return std::string("Unknown");
#endif
    }

    static std::size_t freeMemAvailable ();
    static void profilerStart ();
    static void profilerStop ();

#ifdef AMREX_USE_GPU

    static int memoryPoolsSupported () noexcept { return memory_pools_supported; }

#if defined(AMREX_USE_HIP)
    static AMREX_EXPORT constexpr int warp_size = AMREX_AMDGCN_WAVEFRONT_SIZE;
#elif defined(AMREX_USE_SYCL)
    static AMREX_EXPORT constexpr int warp_size = AMREX_SYCL_SUB_GROUP_SIZE;
#else
    static AMREX_EXPORT constexpr int warp_size = AMREX_HIP_OR_CUDA(64,32);
#endif
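
    // Since warp_size is constexpr, it can size compile-time arrays in device
    // code, e.g. per-warp shared-memory scratch (illustrative CUDA/HIP sketch;
    // `scratch` and `lane` are hypothetical kernel-local names):
    //
    //   __shared__ amrex::Real scratch[amrex::Gpu::Device::warp_size];
    //   int lane = threadIdx.x % amrex::Gpu::Device::warp_size;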

    static unsigned int maxBlocksPerLaunch () noexcept { return max_blocks_per_launch; }

#ifdef AMREX_USE_SYCL
    static Long maxMemAllocSize () noexcept { return device_prop.maxMemAllocSize; }
    static sycl::context& syclContext () { return *sycl_context; }
    static sycl::device& syclDevice () { return *sycl_device; }
#endif
#endif

private:

    static void initialize_gpu ();

    static AMREX_EXPORT int device_id;
    static AMREX_EXPORT int num_devices_used;
    static AMREX_EXPORT int num_device_partners;
    static AMREX_EXPORT int verbose;
    static AMREX_EXPORT int max_gpu_streams;

#ifdef AMREX_USE_GPU
    static AMREX_EXPORT dim3 numThreadsMin;
    static AMREX_EXPORT dim3 numBlocksOverride, numThreadsOverride;

    static AMREX_EXPORT Vector<gpuStream_t> gpu_stream_pool; // The size of this is max_gpu_streams
    // The non-owning gpu_stream stores the stream currently in use by each thread.
    // It is a vector so that different threads can safely write to their own entries.
    static AMREX_EXPORT Vector<gpuStream_t> gpu_stream; // The size of this is omp_max_threads
    static AMREX_EXPORT gpuDeviceProp_t device_prop;
    static AMREX_EXPORT int memory_pools_supported;
    static AMREX_EXPORT unsigned int max_blocks_per_launch;

#ifdef AMREX_USE_SYCL
    static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
    static AMREX_EXPORT std::unique_ptr<sycl::device> sycl_device;
#endif
#endif
};

// Put these in amrex::Gpu

#if defined(AMREX_USE_GPU)
inline gpuStream_t
gpuStream () noexcept
{
    return Device::gpuStream();
}
#endif

inline int
numGpuStreams () noexcept
{
    return Device::numGpuStreams();
}

inline void
synchronize () noexcept
{
    Device::synchronize();
}

inline void
streamSynchronize () noexcept
{
    Device::streamSynchronize();
}

inline void
streamSynchronizeAll () noexcept
{
    Device::streamSynchronizeAll();
}

#ifdef AMREX_USE_GPU

inline void
htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept
{
    if (sz == 0) { return; }
#ifdef AMREX_USE_SYCL
    auto& q = Device::streamQueue();
    q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
#else
    AMREX_HIP_OR_CUDA(
        AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice, gpuStream()));,
        AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d, p_h, sz, cudaMemcpyHostToDevice, gpuStream())); )
#endif
}

inline void
dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept
{
    if (sz == 0) { return; }
#ifdef AMREX_USE_SYCL
    auto& q = Device::streamQueue();
    q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
#else
    AMREX_HIP_OR_CUDA(
        AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost, gpuStream()));,
        AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_h, p_d, sz, cudaMemcpyDeviceToHost, gpuStream())); )
#endif
}

inline void
dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
{
    if (sz == 0) { return; }
#ifdef AMREX_USE_SYCL
    auto& q = Device::streamQueue();
    q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
#else
    AMREX_HIP_OR_CUDA(
        AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice, gpuStream()));,
        AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d_dst, p_d_src, sz, cudaMemcpyDeviceToDevice, gpuStream())); )
#endif
}

inline void
htod_memcpy (void* p_d, const void* p_h, const std::size_t sz) noexcept
{
    if (sz == 0) { return; }
    htod_memcpy_async(p_d, p_h, sz);
    Gpu::streamSynchronize();
}

inline void
dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept
{
    if (sz == 0) { return; }
    dtoh_memcpy_async(p_h, p_d, sz);
    Gpu::streamSynchronize();
}

inline void
dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
{
    if (sz == 0) { return; }
    dtod_memcpy_async(p_d_dst, p_d_src, sz);
    Gpu::streamSynchronize();
}
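
// A minimal usage sketch of the copy wrappers above (`h_buf` and `d_buf` are
// hypothetical host and device allocations of at least `nbytes` bytes).  The
// *_async variants enqueue on the current stream and return immediately; the
// blocking variants call Gpu::streamSynchronize() before returning.
//
//   amrex::Gpu::htod_memcpy_async(d_buf, h_buf, nbytes); // host -> device
//   // ... launch kernels that read d_buf on the same stream ...
//   amrex::Gpu::dtoh_memcpy(h_buf, d_buf, nbytes);       // device -> host + sync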

#endif

#ifdef AMREX_USE_HYPRE
void hypreSynchronize ();
#endif

}

#endif