Olivier Deprez | f4ef2d0 | 2021-04-20 13:36:24 +0200 | [diff] [blame] | 1 | //====--- OMPGridValues.h - Language-specific address spaces --*- C++ -*-====// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// \brief Provides definitions for Target specific Grid Values |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #ifndef LLVM_OPENMP_GRIDVALUES_H |
| 15 | #define LLVM_OPENMP_GRIDVALUES_H |
| 16 | |
| 17 | namespace llvm { |
| 18 | |
| 19 | namespace omp { |
| 20 | |
| 21 | /// \brief Defines various target-specific GPU grid values that must be |
| 22 | /// consistent between host RTL (plugin), device RTL, and clang. |
| 23 | /// We can change grid values for a "fat" binary so that different |
| 24 | /// passes get the correct values when generating code for a |
| 25 | /// multi-target binary. Both amdgcn and nvptx values are stored in |
| 26 | /// this file. In the future, should there be differences between GPUs |
| 27 | /// of the same architecture, then simply make a different array and |
| 28 | /// use the new array name. |
| 29 | /// |
| 30 | /// Example usage in clang: |
| 31 | /// const unsigned slot_size = |
| 32 | /// ctx.GetTargetInfo().getGridValue(llvm::omp::GVIDX::GV_Warp_Size); |
| 33 | /// |
| 34 | /// Example usage in libomptarget/deviceRTLs: |
| 35 | /// #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
| 36 | /// #ifdef __AMDGPU__ |
| 37 | /// #define GRIDVAL AMDGPUGpuGridValues |
| 38 | /// #else |
| 39 | /// #define GRIDVAL NVPTXGpuGridValues |
| 40 | /// #endif |
| 41 | /// ... Then use this reference for GV_Warp_Size in the deviceRTL source. |
| 42 | /// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size] |
| 43 | /// |
| 44 | /// Example usage in libomptarget hsa plugin: |
| 45 | /// #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
| 46 | /// #define GRIDVAL AMDGPUGpuGridValues |
| 47 | /// ... Then use this reference to access GV_Warp_Size in the hsa plugin. |
| 48 | /// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size] |
| 49 | /// |
| 50 | /// Example usage in libomptarget cuda plugin: |
| 51 | /// #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
| 52 | /// #define GRIDVAL NVPTXGpuGridValues |
| 53 | /// ... Then use this reference to access GV_Warp_Size in the cuda plugin. |
| 54 | /// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size] |
| 55 | /// |
| 56 | enum GVIDX { |
| 57 | /// The maximum number of workers in a kernel. |
| 58 | /// (THREAD_ABSOLUTE_LIMIT) - (GV_Warp_Size), might be issue for blockDim.z |
| 59 | GV_Threads, |
| 60 | /// The size reserved for data in a shared memory slot. |
| 61 | GV_Slot_Size, |
| 62 | /// The default value of maximum number of threads in a worker warp. |
| 63 | GV_Warp_Size, |
| 64 | /// Alternate warp size for some AMDGCN architectures. Same as GV_Warp_Size |
| 65 | /// for NVPTX. |
| 66 | GV_Warp_Size_32, |
| 67 | /// The number of bits required to represent the max number of threads in warp |
| 68 | GV_Warp_Size_Log2, |
| 69 | /// GV_Warp_Size * GV_Slot_Size, |
| 70 | GV_Warp_Slot_Size, |
| 71 | /// the maximum number of teams. |
| 72 | GV_Max_Teams, |
| 73 | /// Global Memory Alignment |
| 74 | GV_Mem_Align, |
| 75 | /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2)) |
| 76 | GV_Warp_Size_Log2_Mask, |
| 77 | // An alternative to the heavy data sharing infrastructure that uses global |
| 78 | // memory is one that uses device __shared__ memory. The amount of such space |
| 79 | // (in bytes) reserved by the OpenMP runtime is noted here. |
| 80 | GV_SimpleBufferSize, |
| 81 | // The absolute maximum team size for a working group |
| 82 | GV_Max_WG_Size, |
| 83 | // The default maximum team size for a working group |
| 84 | GV_Default_WG_Size, |
| 85 | // This is GV_Max_WG_Size / GV_WarpSize. 32 for NVPTX and 16 for AMDGCN. |
| 86 | GV_Max_Warp_Number, |
| 87 | /// The slot size that should be reserved for a working warp. |
| 88 | /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2)) |
| 89 | GV_Warp_Size_Log2_MaskL |
| 90 | }; |
| 91 | |
| 92 | /// For AMDGPU GPUs |
| 93 | static constexpr unsigned AMDGPUGpuGridValues[] = { |
| 94 | 448, // GV_Threads |
| 95 | 256, // GV_Slot_Size |
| 96 | 64, // GV_Warp_Size |
| 97 | 32, // GV_Warp_Size_32 |
| 98 | 6, // GV_Warp_Size_Log2 |
| 99 | 64 * 256, // GV_Warp_Slot_Size |
| 100 | 128, // GV_Max_Teams |
| 101 | 256, // GV_Mem_Align |
| 102 | 63, // GV_Warp_Size_Log2_Mask |
| 103 | 896, // GV_SimpleBufferSize |
| 104 | 1024, // GV_Max_WG_Size, |
| 105 | 256, // GV_Defaut_WG_Size |
| 106 | 1024 / 64, // GV_Max_WG_Size / GV_WarpSize |
| 107 | 63 // GV_Warp_Size_Log2_MaskL |
| 108 | }; |
| 109 | |
| 110 | /// For Nvidia GPUs |
| 111 | static constexpr unsigned NVPTXGpuGridValues[] = { |
| 112 | 992, // GV_Threads |
| 113 | 256, // GV_Slot_Size |
| 114 | 32, // GV_Warp_Size |
| 115 | 32, // GV_Warp_Size_32 |
| 116 | 5, // GV_Warp_Size_Log2 |
| 117 | 32 * 256, // GV_Warp_Slot_Size |
| 118 | 1024, // GV_Max_Teams |
| 119 | 256, // GV_Mem_Align |
| 120 | (~0u >> (32 - 5)), // GV_Warp_Size_Log2_Mask |
| 121 | 896, // GV_SimpleBufferSize |
| 122 | 1024, // GV_Max_WG_Size |
| 123 | 128, // GV_Defaut_WG_Size |
| 124 | 1024 / 32, // GV_Max_WG_Size / GV_WarpSize |
| 125 | 31 // GV_Warp_Size_Log2_MaskL |
| 126 | }; |
| 127 | |
| 128 | } // namespace omp |
| 129 | } // namespace llvm |
| 130 | |
| 131 | #endif // LLVM_OPENMP_GRIDVALUES_H |