LCOV - code coverage report
Current view: top level - ext/amrex/3d-coverage-g++-24.08/include - AMReX_GpuDevice.H (source / functions) Hit Total Coverage
Test: coverage_merged.info Lines: 0 4 0.0 %
Date: 2025-01-16 18:33:59 Functions: 0 2 0.0 %

          Line data    Source code
       1             : #ifndef AMREX_GPU_DEVICE_H_
       2             : #define AMREX_GPU_DEVICE_H_
       3             : #include <AMReX_Config.H>
       4             : 
       5             : #include <AMReX.H>
       6             : #include <AMReX_Extension.H>
       7             : #include <AMReX_Utility.H>
       8             : #include <AMReX_GpuTypes.H>
       9             : #include <AMReX_GpuError.H>
      10             : #include <AMReX_GpuControl.H>
      11             : #include <AMReX_OpenMP.H>
      12             : #include <AMReX_Vector.H>
      13             : 
      14             : #include <algorithm>
      15             : #include <array>
      16             : #include <cstdlib>
      17             : #include <memory>
      18             : 
      19             : #define AMREX_GPU_MAX_STREAMS 8
      20             : 
      21             : #ifdef AMREX_USE_GPU
      22             : namespace amrex {
      23             : #ifdef AMREX_USE_HIP
      24             : using gpuDeviceProp_t = hipDeviceProp_t;
      25             : #elif defined(AMREX_USE_CUDA)
      26             : using gpuDeviceProp_t = cudaDeviceProp;
      27             : #elif defined(AMREX_USE_SYCL)
      28             :     struct gpuDeviceProp_t {
      29             :         std::string name;
      30             :         std::string vendor; // SYCL only (inferred for CUDA and HIP)
      31             :         std::size_t totalGlobalMem;
      32             :         std::size_t sharedMemPerBlock;
      33             :         int multiProcessorCount;
      34             :         int maxThreadsPerMultiProcessor;
      35             :         int maxThreadsPerBlock;
      36             :         int maxThreadsDim[3];
      37             :         int maxGridSize[3];
      38             :         int warpSize;
      39             :         Long maxMemAllocSize; // SYCL only
      40             :         int managedMemory;
      41             :         int concurrentManagedAccess;
      42             :         int maxParameterSize;
      43             :     };
      44             : #endif
      45             : }
      46             : #endif
      47             : 
      48             : namespace amrex::Gpu {
      49             : 
      50             : class Device
      51             : {
      52             : 
      53             : public:
      54             : 
      55             :     static void Initialize ();
      56             :     static void Finalize ();
      57             : 
      58             : #if defined(AMREX_USE_GPU)
      59             :     static gpuStream_t gpuStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
      60             : #ifdef AMREX_USE_CUDA
      61             :     /** for backward compatibility */
      62             :     static cudaStream_t cudaStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
      63             : #endif
      64             : #ifdef AMREX_USE_SYCL
      65             :     static sycl::queue& streamQueue () noexcept { return *(gpu_stream[OpenMP::get_thread_num()].queue); }
      66             :     static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].queue); }
      67             : #endif
      68             : #endif
      69             : 
      70           0 :     static int numGpuStreams () noexcept {
      71           0 :         return inSingleStreamRegion() ? 1 : max_gpu_streams;
      72             :     }
      73             : 
      74             :     static void setStreamIndex (int idx) noexcept;
      75             :     static void resetStreamIndex () noexcept { setStreamIndex(0); }
      76             : 
      77             : #ifdef AMREX_USE_GPU
      78             :     static int streamIndex (gpuStream_t s = gpuStream()) noexcept;
      79             : 
      80             :     static gpuStream_t setStream (gpuStream_t s) noexcept;
      81             :     static gpuStream_t resetStream () noexcept;
      82             : #endif
      83             : 
      84             :     static int deviceId () noexcept;
      85             :     static int numDevicesUsed () noexcept; // Total number of device used
      86             :     static int numDevicePartners () noexcept; // Number of partners sharing my device
      87             : 
      88             :     /**
      89             :      * Halt execution of code until GPU has finished processing all previously requested
      90             :      * tasks.
      91             :      */
      92             :     static void synchronize () noexcept;
      93             : 
      94             :     /**
      95             :      * Halt execution of code until the current AMReX GPU stream has finished processing all
      96             :      * previously requested tasks.
      97             :      */
      98             :     static void streamSynchronize () noexcept;
      99             : 
     100             :     /**
     101             :      * Halt execution of code until all AMReX GPU streams have finished processing all
     102             :      * previously requested tasks.
     103             :      */
     104             :     static void streamSynchronizeAll () noexcept;
     105             : 
     106             : #if defined(__CUDACC__)
     107             :     /**  Generic graph selection. These should be called by users.  */
     108             :     static void startGraphRecording(bool first_iter, void* h_ptr, void* d_ptr, size_t sz);
     109             :     static cudaGraphExec_t stopGraphRecording(bool last_iter);
     110             : 
     111             :     /** Instantiate a created cudaGtaph */
     112             :     static cudaGraphExec_t instantiateGraph(cudaGraph_t graph);
     113             : 
     114             :     /** Execute an instantiated cudaGraphExec */
     115             :     static void executeGraph(const cudaGraphExec_t &graphExec, bool synch = true);
     116             : 
     117             : #endif
     118             : 
     119             :     static void mem_advise_set_preferred (void* p, std::size_t sz, int device);
     120             :     static void mem_advise_set_readonly (void* p, std::size_t sz);
     121             : 
     122             : #ifdef AMREX_USE_GPU
     123             :     static void setNumThreadsMin (int nx, int ny, int nz) noexcept;
     124             :     static void n_threads_and_blocks (const Long N, dim3& numBlocks, dim3& numThreads) noexcept;
     125             :     static void c_comps_threads_and_blocks (const int* lo, const int* hi, const int comps,
     126             :                                             dim3& numBlocks, dim3& numThreads) noexcept;
     127             :     static void c_threads_and_blocks (const int* lo, const int* hi, dim3& numBlocks, dim3& numThreads) noexcept;
     128             :     static void grid_stride_threads_and_blocks (dim3& numBlocks, dim3& numThreads) noexcept;
     129             : 
     130             :     static std::size_t totalGlobalMem () noexcept { return device_prop.totalGlobalMem; }
     131             :     static std::size_t sharedMemPerBlock () noexcept { return device_prop.sharedMemPerBlock; }
     132             :     static int numMultiProcessors () noexcept { return device_prop.multiProcessorCount; }
     133             :     static int maxThreadsPerMultiProcessor () noexcept { return device_prop.maxThreadsPerMultiProcessor; }
     134             :     static int maxThreadsPerBlock () noexcept { return device_prop.maxThreadsPerBlock; }
     135             :     static int maxThreadsPerBlock (int dir) noexcept { return device_prop.maxThreadsDim[dir]; }
     136             :     static int maxBlocksPerGrid (int dir) noexcept { return device_prop.maxGridSize[dir]; }
     137             :     static std::string deviceName () noexcept { return std::string(device_prop.name); }
     138             : #endif
     139             : 
     140             : #ifdef AMREX_USE_CUDA
     141             :     static int devicePropMajor () noexcept { return device_prop.major; }
     142             :     static int devicePropMinor () noexcept { return device_prop.minor; }
     143             : #endif
     144             : 
     145             :     static std::string deviceVendor() noexcept
     146             :     {
     147             : #if defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     148             :         return std::string("AMD");
     149             : #elif defined(AMREX_USE_CUDA) || (defined(AMREX_USE_HIP) && defined(__HIP_PLATFORM_NVIDIA__))
     150             :         // Using HIP on NVIDIA GPUs isn't currently supported by AMReX
     151             :         return std::string("NVIDIA");
     152             : #elif defined(AMREX_USE_SYCL)
     153             :         return device_prop.vendor;
     154             : #else
     155             :         return std::string("Unknown");
     156             : #endif
     157             :     }
     158             : 
     159             :     static std::size_t freeMemAvailable ();
     160             :     static void profilerStart ();
     161             :     static void profilerStop ();
     162             : 
     163             : #ifdef AMREX_USE_GPU
     164             : 
     165             :     static int memoryPoolsSupported () noexcept { return memory_pools_supported; }
     166             : 
     167             : #if defined(AMREX_USE_HIP)
     168             :     static AMREX_EXPORT constexpr int warp_size = AMREX_AMDGCN_WAVEFRONT_SIZE;
     169             : #elif defined(AMREX_USE_SYCL)
     170             :     static AMREX_EXPORT constexpr int warp_size = AMREX_SYCL_SUB_GROUP_SIZE;
     171             : #else
     172             :     static AMREX_EXPORT constexpr int warp_size = AMREX_HIP_OR_CUDA(64,32);
     173             : #endif
     174             : 
     175             :     static unsigned int maxBlocksPerLaunch () noexcept { return max_blocks_per_launch; }
     176             : 
     177             : #ifdef AMREX_USE_SYCL
     178             :     static Long maxMemAllocSize () noexcept { return device_prop.maxMemAllocSize; }
     179             :     static sycl::context& syclContext () { return *sycl_context; }
     180             :     static sycl::device& syclDevice () { return *sycl_device; }
     181             : #endif
     182             : #endif
     183             : 
     184             : private:
     185             : 
     186             :     static void initialize_gpu ();
     187             : 
     188             :     static AMREX_EXPORT int device_id;
     189             :     static AMREX_EXPORT int num_devices_used;
     190             :     static AMREX_EXPORT int num_device_partners;
     191             :     static AMREX_EXPORT int verbose;
     192             :     static AMREX_EXPORT int max_gpu_streams;
     193             : 
     194             : #ifdef AMREX_USE_GPU
     195             :     static AMREX_EXPORT dim3 numThreadsMin;
     196             :     static AMREX_EXPORT dim3 numBlocksOverride, numThreadsOverride;
     197             : 
     198             :     static AMREX_EXPORT Vector<gpuStream_t> gpu_stream_pool; // The size of this is max_gpu_stream
     199             :     // The non-owning gpu_stream is used to store the current stream that will be used.
     200             :     // gpu_stream is a vector so that it's thread safe to write to it.
     201             :     static AMREX_EXPORT Vector<gpuStream_t> gpu_stream; // The size of this is omp_max_threads
     202             :     static AMREX_EXPORT gpuDeviceProp_t device_prop;
     203             :     static AMREX_EXPORT int memory_pools_supported;
     204             :     static AMREX_EXPORT unsigned int max_blocks_per_launch;
     205             : 
     206             : #ifdef AMREX_USE_SYCL
     207             :     static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
     208             :     static AMREX_EXPORT std::unique_ptr<sycl::device>  sycl_device;
     209             : #endif
     210             : #endif
     211             : };
     212             : 
     213             : // Put these in amrex::Gpu
     214             : 
     215             : #if defined(AMREX_USE_GPU)
     216             : inline gpuStream_t
     217             : gpuStream () noexcept
     218             : {
     219             :     return Device::gpuStream();
     220             : }
     221             : #endif
     222             : 
     223             : inline int
     224           0 : numGpuStreams () noexcept
     225             : {
     226           0 :     return Device::numGpuStreams();
     227             : }
     228             : 
     229             : inline void
     230             : synchronize () noexcept
     231             : {
     232             :     Device::synchronize();
     233             : }
     234             : 
     235             : inline void
     236             : streamSynchronize () noexcept
     237             : {
     238             :     Device::streamSynchronize();
     239             : }
     240             : 
     241             : inline void
     242             : streamSynchronizeAll () noexcept
     243             : {
     244             :     Device::streamSynchronizeAll();
     245             : }
     246             : 
     247             : #ifdef AMREX_USE_GPU
     248             : 
     249             : inline void
     250             : htod_memcpy_async (void* p_d, const void* p_h, const std::size_t sz) noexcept
     251             : {
     252             :     if (sz == 0) { return; }
     253             : #ifdef AMREX_USE_SYCL
     254             :     auto& q = Device::streamQueue();
     255             :     q.submit([&] (sycl::handler& h) { h.memcpy(p_d, p_h, sz); });
     256             : #else
     257             :     AMREX_HIP_OR_CUDA(
     258             :         AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d, p_h, sz, hipMemcpyHostToDevice, gpuStream()));,
     259             :         AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d, p_h, sz, cudaMemcpyHostToDevice, gpuStream())); )
     260             : #endif
     261             : }
     262             : 
     263             : inline void
     264             : dtoh_memcpy_async (void* p_h, const void* p_d, const std::size_t sz) noexcept
     265             : {
     266             :     if (sz == 0) { return; }
     267             : #ifdef AMREX_USE_SYCL
     268             :     auto& q = Device::streamQueue();
     269             :     q.submit([&] (sycl::handler& h) { h.memcpy(p_h, p_d, sz); });
     270             : #else
     271             :     AMREX_HIP_OR_CUDA(
     272             :         AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_h, p_d, sz, hipMemcpyDeviceToHost, gpuStream()));,
     273             :         AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_h, p_d, sz, cudaMemcpyDeviceToHost, gpuStream())); )
     274             : #endif
     275             : }
     276             : 
     277             : inline void
     278             : dtod_memcpy_async (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
     279             : {
     280             :     if (sz == 0) { return; }
     281             : #ifdef AMREX_USE_SYCL
     282             :     auto& q = Device::streamQueue();
     283             :     q.submit([&] (sycl::handler& h) { h.memcpy(p_d_dst, p_d_src, sz); });
     284             : #else
     285             :     AMREX_HIP_OR_CUDA(
     286             :         AMREX_HIP_SAFE_CALL(hipMemcpyAsync(p_d_dst, p_d_src, sz, hipMemcpyDeviceToDevice, gpuStream()));,
     287             :         AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(p_d_dst, p_d_src, sz, cudaMemcpyDeviceToDevice, gpuStream())); )
     288             : #endif
     289             : }
     290             : 
     291             : inline void
     292             : htod_memcpy (void* p_d, const void* p_h, const std::size_t sz) noexcept
     293             : {
     294             :     if (sz == 0) { return; }
     295             :     htod_memcpy_async(p_d, p_h, sz);
     296             :     Gpu::streamSynchronize();
     297             : }
     298             : 
     299             : inline void
     300             : dtoh_memcpy (void* p_h, const void* p_d, const std::size_t sz) noexcept
     301             : {
     302             :     if (sz == 0) { return; }
     303             :     dtoh_memcpy_async(p_h, p_d, sz);
     304             :     Gpu::streamSynchronize();
     305             : }
     306             : 
     307             : inline void
     308             : dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
     309             : {
     310             :     if (sz == 0) { return; }
     311             :     dtod_memcpy_async(p_d_dst, p_d_src, sz);
     312             :     Gpu::streamSynchronize();
     313             : }
     314             : 
     315             : #endif
     316             : 
     317             : #ifdef AMREX_USE_HYPRE
     318             : void hypreSynchronize ();
     319             : #endif
     320             : 
     321             : }
     322             : 
     323             : #endif

Generated by: LCOV version 1.14