From d7c64160754cccd8dc11092a865f78d5432e50fd Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 30 Sep 2023 10:05:28 -0500
Subject: [PATCH 01/30] lit: 15.0.6 -> 17.0.1

---
 pkgs/development/python-modules/lit/default.nix | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkgs/development/python-modules/lit/default.nix b/pkgs/development/python-modules/lit/default.nix
index b4dee1e20b55..1467dfaa1f6d 100644
--- a/pkgs/development/python-modules/lit/default.nix
+++ b/pkgs/development/python-modules/lit/default.nix
@@ -6,11 +6,11 @@
 
 buildPythonPackage rec {
   pname = "lit";
-  version = "15.0.6";
+  version = "17.0.1";
 
   src = fetchPypi {
     inherit pname version;
-    hash = "sha256-S06OQfDmDyutls21HxyQ016ku3FTTsDOP8Di67d9f+k=";
+    hash = "sha256-RIZ65Xa1eQVnsSC8Pw2fAh2slCTRsIQMdazYX0YQrAQ=";
   };
 
   passthru = {

From ccdfcd324e90f57954c6bd631b0bfbe3e03a5f99 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 30 Sep 2023 21:40:13 -0500
Subject: [PATCH 02/30] spirv-llvm-translator: Add 17.0.0 release

---
 .../compilers/spirv-llvm-translator/default.nix        | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pkgs/development/compilers/spirv-llvm-translator/default.nix b/pkgs/development/compilers/spirv-llvm-translator/default.nix
index c5e8ee5a6214..c6c743ce1e59 100644
--- a/pkgs/development/compilers/spirv-llvm-translator/default.nix
+++ b/pkgs/development/compilers/spirv-llvm-translator/default.nix
@@ -13,13 +13,17 @@ let
   llvmMajor = lib.versions.major llvm.version;
   isROCm = lib.hasPrefix "rocm" llvm.pname;
 
-  # ROCm will always be at the latest version
+  # ROCm, if actively updated will always be at the latest version
   branch =
-    if llvmMajor == "16" then rec {
+    if llvmMajor == "17" || isROCm then rec {
+      version = "17.0.0";
+      rev = "v${version}";
+      hash = "sha256-Rzm5Py9IPFtS9G7kME+uSwZ/0gPGW6MlL35ZWk4LfHM=";
+    } else if llvmMajor == "16" then rec {
       version = "16.0.0";
       rev = "v${version}";
       hash = "sha256-EUabcYqSjXshbPmcs1DRLvCSL1nd9rEdpqELBrItCW8=";
-    } else if llvmMajor == "15" || isROCm then rec {
+    } else if llvmMajor == "15" then rec {
       version = "15.0.0";
       rev = "v${version}";
       hash = "sha256-OsDohXRxovtEXaWiRGp8gJ0dXmoALyO+ZimeSO8aPVI=";

From 37390cd11c878ac889ba8c27fe77f6d195737c9d Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 30 Sep 2023 04:47:55 +0000
Subject: [PATCH 03/30] rocm-related: 5.4.X -> 5.7.0

hip-common: 5.4.2 -> 5.7.0

hipblas: 5.4.3 -> 5.7.0

hipcc: 5.4.2 -> 5.7.0

hipcub: 5.4.4 -> 5.7.0

hipfft: 5.4.3 -> 5.7.0

hipfort: 5.6.0 -> 5.7.0

hipify: 5.4.2 -> 5.7.0

hipsolver: 5.4.4 -> 5.7.0

hipsparse: 5.4.4 -> 5.7.0

llvmPackages_rocm.llvm: 5.4.4 -> 5.7.0

migraphx: 5.4.3 -> 5.7.0

clang-ocl: 5.4.2 -> 5.7.0

rccl: 5.4.3 -> 5.7.0

rdc: 5.4.2 -> 5.7.0

rocblas: 5.4.3 -> 5.7.0

rocdbgapi: 5.4.2 -> 5.7.0

rocfft: 5.4.3 -> 5.7.0

miopen: 5.4.2 -> 5.7.0

rocm-comgr: 5.4.4 -> 5.7.0

rocgdb: 5.4.2 -> 5.7.0

rocm-device-libs: 5.4.4 -> 5.7.0

rocalution: 5.4.3 -> 5.7.0

rocm-smi: 5.4.4 -> 5.7.0

rocm-runtime: 5.4.3 -> 5.7.0

rocminfo: 5.4.4 -> 5.7.0

rocm-thunk: 5.4.4 -> 5.7.0

rocprofiler: 5.4.3 -> 5.7.0

rocprim: 5.4.3 -> 5.7.0

rocrand: 5.4.3 -> 5.7.0

rocmlir: 5.4.1 -> 5.7.0

rocr-debug-agent: 5.4.2 -> 5.7.0

rocthrust: 5.4.3 -> 5.7.0

rocsparse: 5.4.3 -> 5.7.0

roctracer: 5.4.3 -> 5.7.0

rocsolver: 5.4.4 -> 5.7.0

tensile: 5.4.2 -> 5.7.0

rocwmma: 5.4.3 -> 5.7.0

rocm-cmake: 5.4.4 -> 5.7.0
---
 pkgs/development/compilers/hip-common/default.nix            | 4 ++--
 pkgs/development/compilers/hipcc/default.nix                 | 4 ++--
 pkgs/development/compilers/hipify/default.nix                | 4 ++--
 pkgs/development/compilers/llvm/rocm/llvm.nix                | 4 ++--
 pkgs/development/libraries/clang-ocl/default.nix             | 2 +-
 pkgs/development/libraries/hipblas/default.nix               | 4 ++--
 pkgs/development/libraries/hipcub/default.nix                | 4 ++--
 pkgs/development/libraries/hipfft/default.nix                | 4 ++--
 pkgs/development/libraries/hipfort/default.nix               | 4 ++--
 pkgs/development/libraries/hipsolver/default.nix             | 4 ++--
 pkgs/development/libraries/hipsparse/default.nix             | 4 ++--
 pkgs/development/libraries/migraphx/default.nix              | 4 ++--
 pkgs/development/libraries/miopen/default.nix                | 4 ++--
 pkgs/development/libraries/rccl/default.nix                  | 4 ++--
 pkgs/development/libraries/rocalution/default.nix            | 4 ++--
 pkgs/development/libraries/rocblas/default.nix               | 4 ++--
 pkgs/development/libraries/rocdbgapi/default.nix             | 4 ++--
 pkgs/development/libraries/rocfft/default.nix                | 4 ++--
 pkgs/development/libraries/rocm-comgr/default.nix            | 4 ++--
 pkgs/development/libraries/rocm-device-libs/default.nix      | 4 ++--
 pkgs/development/libraries/rocm-runtime/default.nix          | 4 ++--
 pkgs/development/libraries/rocm-thunk/default.nix            | 4 ++--
 pkgs/development/libraries/rocmlir/default.nix               | 4 ++--
 pkgs/development/libraries/rocprim/default.nix               | 4 ++--
 pkgs/development/libraries/rocprofiler/default.nix           | 4 ++--
 pkgs/development/libraries/rocr-debug-agent/default.nix      | 4 ++--
 pkgs/development/libraries/rocrand/default.nix               | 4 ++--
 pkgs/development/libraries/rocsolver/default.nix             | 4 ++--
 pkgs/development/libraries/rocsparse/default.nix             | 4 ++--
 pkgs/development/libraries/rocthrust/default.nix             | 4 ++--
 pkgs/development/libraries/roctracer/default.nix             | 4 ++--
 pkgs/development/libraries/rocwmma/default.nix               | 4 ++--
 pkgs/development/libraries/tensile/default.nix               | 4 ++--
 pkgs/development/tools/build-managers/rocm-cmake/default.nix | 4 ++--
 pkgs/development/tools/misc/rdc/default.nix                  | 4 ++--
 pkgs/development/tools/misc/rocgdb/default.nix               | 4 ++--
 pkgs/development/tools/rocminfo/default.nix                  | 4 ++--
 pkgs/tools/system/rocm-smi/default.nix                       | 4 ++--
 38 files changed, 75 insertions(+), 75 deletions(-)

diff --git a/pkgs/development/compilers/hip-common/default.nix b/pkgs/development/compilers/hip-common/default.nix
index 1721091d6a60..754fea89ac5e 100644
--- a/pkgs/development/compilers/hip-common/default.nix
+++ b/pkgs/development/compilers/hip-common/default.nix
@@ -11,13 +11,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "hip-common";
-  version = "5.4.2";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "ROCm-Developer-Tools";
     repo = "HIP";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-44CZWk6EsP5EduzBCBbOh2kshS89qOm4v3mx/xNDzV0=";
+    hash = "sha256-1Abit9qZCwrCVcnaFT4uMygFB9G6ovRasLmTsOsJ/Fw=";
   };
 
   patches = [
diff --git a/pkgs/development/compilers/hipcc/default.nix b/pkgs/development/compilers/hipcc/default.nix
index af0cb35c1480..b758d0e1ed96 100644
--- a/pkgs/development/compilers/hipcc/default.nix
+++ b/pkgs/development/compilers/hipcc/default.nix
@@ -12,13 +12,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "hipcc";
-  version = "5.4.2";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "ROCm-Developer-Tools";
     repo = "HIPCC";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-PEwue4O43MiMkF8UmTeHsmlikBG2V3/nFQLKmtHrRWQ=";
+    hash = "sha256-lJX6nF1V4YmK5ai7jivXlRnG3doIOf6X9CWLHVdRuVg=";
   };
 
   patches = [
diff --git a/pkgs/development/compilers/hipify/default.nix b/pkgs/development/compilers/hipify/default.nix
index 342e8e7e8bd4..d7b243b9da04 100644
--- a/pkgs/development/compilers/hipify/default.nix
+++ b/pkgs/development/compilers/hipify/default.nix
@@ -8,13 +8,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "hipify";
-  version = "5.4.2";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "ROCm-Developer-Tools";
     repo = "HIPIFY";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-EaHtI1ywjEHioWptuHvCllJ3dENtSClVoE6NpWTOa9I=";
+    hash = "sha256-lCQ2VTUGmFC90Xu70/tvoeDhFaInGqLT3vC2A1UojNI=";
   };
 
   nativeBuildInputs = [ cmake ];
diff --git a/pkgs/development/compilers/llvm/rocm/llvm.nix b/pkgs/development/compilers/llvm/rocm/llvm.nix
index 5475f411304b..7fa0bbd35eea 100644
--- a/pkgs/development/compilers/llvm/rocm/llvm.nix
+++ b/pkgs/development/compilers/llvm/rocm/llvm.nix
@@ -53,7 +53,7 @@ let
   llvmTargetsToBuild' = [ "AMDGPU" ] ++ builtins.map inferNativeTarget llvmTargetsToBuild;
 in stdenv.mkDerivation (finalAttrs: {
   pname = "rocm-llvm-${targetName}";
-  version = "5.4.4";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -70,7 +70,7 @@ in stdenv.mkDerivation (finalAttrs: {
     owner = "RadeonOpenCompute";
     repo = "llvm-project";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-BDvC6QFDFtahA9hmJDLiM6K4mrO3j9E9rEXm7KulcuA=";
+    hash = "sha256-oJIXALwxo130jl8b6yCFw+a2kMBlny5/0ubiqF6MOWY=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/clang-ocl/default.nix b/pkgs/development/libraries/clang-ocl/default.nix
index 8053b672d366..96fc4945747f 100644
--- a/pkgs/development/libraries/clang-ocl/default.nix
+++ b/pkgs/development/libraries/clang-ocl/default.nix
@@ -9,7 +9,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "clang-ocl";
-  version = "5.4.2";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "RadeonOpenCompute";
diff --git a/pkgs/development/libraries/hipblas/default.nix b/pkgs/development/libraries/hipblas/default.nix
index e8402c0d05bd..845c5b9d0d7d 100644
--- a/pkgs/development/libraries/hipblas/default.nix
+++ b/pkgs/development/libraries/hipblas/default.nix
@@ -18,7 +18,7 @@
 # Can also use cuBLAS
 stdenv.mkDerivation (finalAttrs: {
   pname = "hipblas";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -34,7 +34,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "hipBLAS";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-mSZCq8UaiffMzWVflW1nAX6CQZ1DqwWJaSIzKslZSEk=";
+    hash = "sha256-abaEZN82dsoEC5gIF3/6epRDVz5ItUo6CkZsybu/G+g=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/hipcub/default.nix b/pkgs/development/libraries/hipcub/default.nix
index d0f33c0dc1d2..fff34e1a0ec7 100644
--- a/pkgs/development/libraries/hipcub/default.nix
+++ b/pkgs/development/libraries/hipcub/default.nix
@@ -15,7 +15,7 @@
 # CUB can also be used as a backend instead of rocPRIM.
 stdenv.mkDerivation (finalAttrs: {
   pname = "hipcub";
-  version = "5.4.4";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -29,7 +29,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "hipCUB";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-reFxSOYQOf9QcoZzaLt4D1yKGQoDxpt/3rwiHgP1DCo=";
+    hash = "sha256-ygBEA3NuCQ13QrSzGqyWXkx8Dy9WhR3u4syzapRTkFU=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/hipfft/default.nix b/pkgs/development/libraries/hipfft/default.nix
index c4d13cb4f55e..c208296c687b 100644
--- a/pkgs/development/libraries/hipfft/default.nix
+++ b/pkgs/development/libraries/hipfft/default.nix
@@ -20,7 +20,7 @@
 # Can also use cuFFT
 stdenv.mkDerivation (finalAttrs: {
   pname = "hipfft";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -36,7 +36,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "hipFFT";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-yDtm9J0wqH6zo4HcgQbqhvwbzbOiJPQ48gJ2gC8PvjA=";
+    hash = "sha256-fuYRKdlTrRMwxr3cgMeT3YniPzs4nuvF8YCzr3LLPFM=";
     fetchSubmodules = true;
   };
 
diff --git a/pkgs/development/libraries/hipfort/default.nix b/pkgs/development/libraries/hipfort/default.nix
index 5c5f0f61e81c..4bb2a270271b 100644
--- a/pkgs/development/libraries/hipfort/default.nix
+++ b/pkgs/development/libraries/hipfort/default.nix
@@ -9,13 +9,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "hipfort";
-  version = "5.6.0";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "ROCmSoftwarePlatform";
     repo = "hipfort";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-x1pF9md7RIcobE/4UxHxOaURbljFZGOashW1KM0lmo0=";
+    hash = "sha256-DRjUWhdinDKP7CZgq2SmU3lGmmodCuXvco9aEeMLSZ4=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/hipsolver/default.nix b/pkgs/development/libraries/hipsolver/default.nix
index ff499b342f26..cd689856d418 100644
--- a/pkgs/development/libraries/hipsolver/default.nix
+++ b/pkgs/development/libraries/hipsolver/default.nix
@@ -18,7 +18,7 @@
 # Can also use cuSOLVER
 stdenv.mkDerivation (finalAttrs: {
   pname = "hipsolver";
-  version = "5.4.4";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -34,7 +34,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "hipSOLVER";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-p9hgKqRALLItv/HTpVlTsu+m9wlwCBYPYnJcm8StIao=";
+    hash = "sha256-I9Xjkilo+baeM1CRXjLAbj/vrg8r5/E2yEImhHGSyf8=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/hipsparse/default.nix b/pkgs/development/libraries/hipsparse/default.nix
index 4f4b0c7cdc00..45a571735b70 100644
--- a/pkgs/development/libraries/hipsparse/default.nix
+++ b/pkgs/development/libraries/hipsparse/default.nix
@@ -17,7 +17,7 @@
 # This can also use cuSPARSE as a backend instead of rocSPARSE
 stdenv.mkDerivation (finalAttrs: {
   pname = "hipsparse";
-  version = "5.4.4";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -31,7 +31,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "hipSPARSE";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-JWjmMvqIm4in1aPq2UgYmL0eWjrrRBiU6vH3FnCZZ40=";
+    hash = "sha256-txigaOoZMI/v+EQLgGlj2O0IHfE7EpgjL0cyv49nKzo=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/migraphx/default.nix b/pkgs/development/libraries/migraphx/default.nix
index 3b32b86f41af..2a842a3c24dd 100644
--- a/pkgs/development/libraries/migraphx/default.nix
+++ b/pkgs/development/libraries/migraphx/default.nix
@@ -46,7 +46,7 @@ let
   };
 in stdenv.mkDerivation (finalAttrs: {
   pname = "migraphx";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -60,7 +60,7 @@ in stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "AMDMIGraphX";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-UDhm+j9qs4Rk81C1PE4kkacytfY2StYbfsCOtFL+p6s=";
+    hash = "sha256-7yL7Zn5I8GUPIAgB7tVLZI7OEHLv0E4FcLVx9xMfsNY=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/miopen/default.nix b/pkgs/development/libraries/miopen/default.nix
index 1b24d8bfec73..5345c2216e66 100644
--- a/pkgs/development/libraries/miopen/default.nix
+++ b/pkgs/development/libraries/miopen/default.nix
@@ -53,7 +53,7 @@ let
   };
 in stdenv.mkDerivation (finalAttrs: {
   pname = "miopen";
-  version = "5.4.2";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -67,7 +67,7 @@ in stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "MIOpen";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-GfXPCXiVJVve3d8sQCQcFLb/vEnKkVEn7xYUhHkEEVI=";
+    hash = "sha256-6Bz4yDbQtV8XlEpwbH0YsJFaaZqH7BOfZDL7F4JTS1Q=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rccl/default.nix b/pkgs/development/libraries/rccl/default.nix
index b3aaaff82f08..acd0030cabd9 100644
--- a/pkgs/development/libraries/rccl/default.nix
+++ b/pkgs/development/libraries/rccl/default.nix
@@ -13,7 +13,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rccl";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -25,7 +25,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "rccl";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-hQTzaiPMo5FAVScmxV0iNhy80uJ1xvx/kzlbfwROOs4=";
+    hash = "sha256-Abrwmsjnkx9JVTrARP/BM965g+R10lY+XPwthy/SG0k=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocalution/default.nix b/pkgs/development/libraries/rocalution/default.nix
index 2796215d0311..f67384a95f08 100644
--- a/pkgs/development/libraries/rocalution/default.nix
+++ b/pkgs/development/libraries/rocalution/default.nix
@@ -21,7 +21,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocalution";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -37,7 +37,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "rocALUTION";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-jovhodhNa7tr1bSqpZCKI/9xF7Ie96JB+giqAEfis2k=";
+    hash = "sha256-+UGpFuZsC4+kmo8LWZWC2YoFJSdTukjN47e1YqW5Zu4=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocblas/default.nix b/pkgs/development/libraries/rocblas/default.nix
index 11385ae64032..78d0e7df8b24 100644
--- a/pkgs/development/libraries/rocblas/default.nix
+++ b/pkgs/development/libraries/rocblas/default.nix
@@ -27,7 +27,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocblas";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -41,7 +41,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "rocBLAS";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-XhYpzBXviMnUdbF6lZi9g0LARKpzWLtDxJxLI3MuHiM=";
+    hash = "sha256-3wKnwvAra8u9xqlC05wUD+gSoBILTVJFU2cIV6xv3Lk=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocdbgapi/default.nix b/pkgs/development/libraries/rocdbgapi/default.nix
index dfeb9249914c..a1cfacf33c27 100644
--- a/pkgs/development/libraries/rocdbgapi/default.nix
+++ b/pkgs/development/libraries/rocdbgapi/default.nix
@@ -37,7 +37,7 @@ let
   };
 in stdenv.mkDerivation (finalAttrs: {
   pname = "rocdbgapi";
-  version = "5.4.2";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -49,7 +49,7 @@ in stdenv.mkDerivation (finalAttrs: {
     owner = "ROCm-Developer-Tools";
     repo = "ROCdbgapi";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-KoFa6JzoEPT5/ns9X/hMfu8bOh29HD9n2qGJ3gzhiBA=";
+    hash = "sha256-qMXvgcS61lgcylz62ErYq8fhpYIR31skQEeKUryuP1w=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocfft/default.nix b/pkgs/development/libraries/rocfft/default.nix
index d1136d4be8e6..8eed31b8b233 100644
--- a/pkgs/development/libraries/rocfft/default.nix
+++ b/pkgs/development/libraries/rocfft/default.nix
@@ -83,13 +83,13 @@ let
 in
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocfft";
-  version = "5.4.3";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "ROCmSoftwarePlatform";
     repo = "rocFFT";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-FsefE0B2hF5ZcHDB6TscwFeZ1NKFkWX7VDpEvvbDbOk=";
+    hash = "sha256-GZSi03geTT+NUztBWhGYyghLqJGsFjUQzVAKQ7d03uA=";
   };
 
   patches = [
diff --git a/pkgs/development/libraries/rocm-comgr/default.nix b/pkgs/development/libraries/rocm-comgr/default.nix
index 6dc7b87934f1..4d84af3afa54 100644
--- a/pkgs/development/libraries/rocm-comgr/default.nix
+++ b/pkgs/development/libraries/rocm-comgr/default.nix
@@ -15,13 +15,13 @@ let
     else throw "Unsupported ROCm LLVM platform";
 in stdenv.mkDerivation (finalAttrs: {
   pname = "rocm-comgr";
-  version = "5.4.4";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "RadeonOpenCompute";
     repo = "ROCm-CompilerSupport";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-qLsrBTeSop7lIQv8gZDwgpvGZJOAq90zsvMi1QpfbAs=";
+    hash = "sha256-QB3G0V92UTW67hD6+zSuExN1+eMT820iYSlMyZeWSFw=";
   };
 
   patches = [ ./cmake.patch ];
diff --git a/pkgs/development/libraries/rocm-device-libs/default.nix b/pkgs/development/libraries/rocm-device-libs/default.nix
index 92e84fe14195..594e21031284 100644
--- a/pkgs/development/libraries/rocm-device-libs/default.nix
+++ b/pkgs/development/libraries/rocm-device-libs/default.nix
@@ -14,13 +14,13 @@ let
     else throw "Unsupported ROCm LLVM platform";
 in stdenv.mkDerivation (finalAttrs: {
   pname = "rocm-device-libs";
-  version = "5.4.4";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "RadeonOpenCompute";
     repo = "ROCm-Device-Libs";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-8gxvgy2GlROxM5qKtZVu5Lxa1FmTIVlBTpfp8rxhNhk=";
+    hash = "sha256-f6/LAhJ2mBDO1/JloHvl7MJyDo3WutbXd4IDknA9nzM=";
   };
 
   patches = [ ./cmake.patch ];
diff --git a/pkgs/development/libraries/rocm-runtime/default.nix b/pkgs/development/libraries/rocm-runtime/default.nix
index d10f7811ccbb..dfb10c363153 100644
--- a/pkgs/development/libraries/rocm-runtime/default.nix
+++ b/pkgs/development/libraries/rocm-runtime/default.nix
@@ -16,13 +16,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocm-runtime";
-  version = "5.4.3";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "RadeonOpenCompute";
     repo = "ROCR-Runtime";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-JkTXTQmdESHSFbA6HZdMK3pYEApz9aoAlMzdXayzdyY=";
+    hash = "sha256-D7Ahan5cxDhqPtV5iDDNys0A4FlxQ9oVRa2EeMoY5Qk=";
   };
 
   sourceRoot = "${finalAttrs.src.name}/src";
diff --git a/pkgs/development/libraries/rocm-thunk/default.nix b/pkgs/development/libraries/rocm-thunk/default.nix
index 86f0044799c0..8a4ad2a098c6 100644
--- a/pkgs/development/libraries/rocm-thunk/default.nix
+++ b/pkgs/development/libraries/rocm-thunk/default.nix
@@ -13,13 +13,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocm-thunk";
-  version = "5.4.4";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "RadeonOpenCompute";
     repo = "ROCT-Thunk-Interface";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-EU5toaKzVeZpdm/YhaQ0bXq0eoYwYQ5qGLUJzxgZVjE=";
+    hash = "sha256-jAMBks2/JaXiA45B3qvLHY8fPeFcr1GHT5Jieuduqhw=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocmlir/default.nix b/pkgs/development/libraries/rocmlir/default.nix
index b8dee1385436..a2a4923148a0 100644
--- a/pkgs/development/libraries/rocmlir/default.nix
+++ b/pkgs/development/libraries/rocmlir/default.nix
@@ -25,7 +25,7 @@ let
     else throw "Unsupported ROCm LLVM platform";
 in stdenv.mkDerivation (finalAttrs: {
   pname = "rocmlir";
-  version = "5.4.1";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -37,7 +37,7 @@ in stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "rocMLIR";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-MokE7Ej8mLHTQeLYvKr7PPlsNG6ul91fqfXDlGu5JpI=";
+    hash = "sha256-vPi4UVljohVAfnwDVQqeOVaJPa6v8aV5uBOtqLddTtc=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocprim/default.nix b/pkgs/development/libraries/rocprim/default.nix
index 9b6ed7edc476..b38684b24dd6 100644
--- a/pkgs/development/libraries/rocprim/default.nix
+++ b/pkgs/development/libraries/rocprim/default.nix
@@ -13,7 +13,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocprim";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -27,7 +27,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "rocPRIM";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-Sqr3lbDMK1Gwucqmr/CHoxw/L6bGj3wlXoHzKTnTqoc=";
+    hash = "sha256-+ukFWsWv3RhS+Z6tmR4TRT8QTYEDuAEk12F9Gv1eXGU=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocprofiler/default.nix b/pkgs/development/libraries/rocprofiler/default.nix
index e7e0c9fed650..97f269beb84e 100644
--- a/pkgs/development/libraries/rocprofiler/default.nix
+++ b/pkgs/development/libraries/rocprofiler/default.nix
@@ -11,13 +11,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocprofiler";
-  version = "5.4.3";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "ROCm-Developer-Tools";
     repo = "rocprofiler";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-CpD/+soMN8WTeSb5X7dsnZ596PMkw+4EVsVSvFtKCak=";
+    hash = "sha256-ue/2uiLbhOv/5XY4cIJuZ8DUMRhniYgxolq9xMwO1FY=";
   };
 
   patches = [ ./0000-dont-require-hsa_amd_aqlprofile.patch ];
diff --git a/pkgs/development/libraries/rocr-debug-agent/default.nix b/pkgs/development/libraries/rocr-debug-agent/default.nix
index 4361ffec3454..08d45f304a4f 100644
--- a/pkgs/development/libraries/rocr-debug-agent/default.nix
+++ b/pkgs/development/libraries/rocr-debug-agent/default.nix
@@ -12,13 +12,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocr-debug-agent";
-  version = "5.4.2";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "ROCm-Developer-Tools";
     repo = "rocr_debug_agent";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-5l6svWSWCxVoyr1zJabxbt5rXQMtdZtHrf9gS2PcRKc=";
+    hash = "sha256-AUDbNrFtUQ5Hm+uv5KMovh7P9wXQKLyRNx9gEQFnv6Y=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocrand/default.nix b/pkgs/development/libraries/rocrand/default.nix
index 5128e17376c3..8ea138d4c3cd 100644
--- a/pkgs/development/libraries/rocrand/default.nix
+++ b/pkgs/development/libraries/rocrand/default.nix
@@ -13,7 +13,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocrand";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -27,7 +27,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "rocRAND";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-xK1JRTW+7odlXRQV9WC6ZfXqRKow/TQ9grHCigw+/us=";
+    hash = "sha256-cFH38fLD8tk6V9JERcqHokuwKemdDgHCZ75bZNEqmdY=";
     fetchSubmodules = true; # For inline hipRAND
   };
 
diff --git a/pkgs/development/libraries/rocsolver/default.nix b/pkgs/development/libraries/rocsolver/default.nix
index 632e93fa6c0d..c78b4d97a0ae 100644
--- a/pkgs/development/libraries/rocsolver/default.nix
+++ b/pkgs/development/libraries/rocsolver/default.nix
@@ -17,7 +17,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocsolver";
-  version = "5.4.4";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -31,7 +31,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "rocSOLVER";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-UHUcA9CVPuYFpE2DTvRrRMMj51yNPo5wMTKnByL2RTg=";
+    hash = "sha256-qxmjm4tgpCnfJ2SqUXndk6y0MsPJUKHvjv/3Uc0smr4=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocsparse/default.nix b/pkgs/development/libraries/rocsparse/default.nix
index d821ee693880..a93d7a77bf26 100644
--- a/pkgs/development/libraries/rocsparse/default.nix
+++ b/pkgs/development/libraries/rocsparse/default.nix
@@ -18,7 +18,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocsparse";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -32,7 +32,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "rocSPARSE";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-jzHD55c4rlPab5IAj2UzHTJI9MKhTfevsLthSZKOEzQ=";
+    hash = "sha256-30q9bqgZJUaNrkMXTAG+Z34yjsQ5DpJP+WBcCiEmF58=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocthrust/default.nix b/pkgs/development/libraries/rocthrust/default.nix
index 45099cb9f4c1..a4981d3fb270 100644
--- a/pkgs/development/libraries/rocthrust/default.nix
+++ b/pkgs/development/libraries/rocthrust/default.nix
@@ -13,7 +13,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocthrust";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -27,7 +27,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "rocThrust";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-JT2PX53N39H+EaThPHo2ol+BUjDQniSQlKMLiYD8NoM=";
+    hash = "sha256-i0XCtJth8caVQT5oUgsxWXNzcePa02Gb7AQsthYTOv8=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/roctracer/default.nix b/pkgs/development/libraries/roctracer/default.nix
index a81f7dc18961..3aeb8e3ba198 100644
--- a/pkgs/development/libraries/roctracer/default.nix
+++ b/pkgs/development/libraries/roctracer/default.nix
@@ -19,7 +19,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "roctracer";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -33,7 +33,7 @@ stdenv.mkDerivation (finalAttrs: {
     owner = "ROCm-Developer-Tools";
     repo = "roctracer";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-5vYUNczylB2ehlvhq1u/H8KUXt8ku2E+jawKrKsU7LY=";
+    hash = "sha256-P6QYyAjMRwFFWKF8AhbrYGe+mYVJXdbBW1or6vcobYU=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocwmma/default.nix b/pkgs/development/libraries/rocwmma/default.nix
index 08667b6d3c92..84db5b4dbebf 100644
--- a/pkgs/development/libraries/rocwmma/default.nix
+++ b/pkgs/development/libraries/rocwmma/default.nix
@@ -36,7 +36,7 @@ let
   };
 in stdenv.mkDerivation (finalAttrs: {
   pname = "rocwmma";
-  version = "5.4.3";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -54,7 +54,7 @@ in stdenv.mkDerivation (finalAttrs: {
     owner = "ROCmSoftwarePlatform";
     repo = "rocWMMA";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-HUJPb6IahBgl/v+W4kXludBTNAjRm8k6v0jxKAX+qZM=";
+    hash = "sha256-/EuBBSjhlMwJfsqYvRb9oCNC0hNkEa1JH1KUDLMSs08=";
   };
 
   patches = lib.optionals (buildTests || buildBenchmarks) [
diff --git a/pkgs/development/libraries/tensile/default.nix b/pkgs/development/libraries/tensile/default.nix
index c6117167855f..7d0165a42060 100644
--- a/pkgs/development/libraries/tensile/default.nix
+++ b/pkgs/development/libraries/tensile/default.nix
@@ -10,13 +10,13 @@
 
 buildPythonPackage rec {
   pname = "tensile";
-  version = "5.4.2";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "ROCmSoftwarePlatform";
     repo = "Tensile";
     rev = "rocm-${version}";
-    hash = "sha256-W6yr6mptfsiJSSzPCImgqI1EmsUv+l99SjqkoZsOjag=";
+    hash = "sha256-CyPGiM/53duJc/oNtOsl6JSsl9uOOYm5R7O6YXaVOm4=";
   };
 
   buildInputs = [
diff --git a/pkgs/development/tools/build-managers/rocm-cmake/default.nix b/pkgs/development/tools/build-managers/rocm-cmake/default.nix
index 206038c0b6e6..9e9cf3caf12e 100644
--- a/pkgs/development/tools/build-managers/rocm-cmake/default.nix
+++ b/pkgs/development/tools/build-managers/rocm-cmake/default.nix
@@ -7,13 +7,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocm-cmake";
-  version = "5.4.4";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "RadeonOpenCompute";
     repo = "rocm-cmake";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-JarQqiiZ36WV1d6vyQD546GN1EtoKLcdvcZsG3QWD2Y=";
+    hash = "sha256-aVjzuJ4BiSfwOdjufFc5CznfnL8di5h992zl+pzD0DU=";
   };
 
   nativeBuildInputs = [ cmake ];
diff --git a/pkgs/development/tools/misc/rdc/default.nix b/pkgs/development/tools/misc/rdc/default.nix
index 906f35b47c52..d2a7f46dc849 100644
--- a/pkgs/development/tools/misc/rdc/default.nix
+++ b/pkgs/development/tools/misc/rdc/default.nix
@@ -41,7 +41,7 @@ let
   };
 in stdenv.mkDerivation (finalAttrs: {
   pname = "rdc";
-  version = "5.4.2";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -55,7 +55,7 @@ in stdenv.mkDerivation (finalAttrs: {
     owner = "RadeonOpenCompute";
     repo = "rdc";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-dYacqkRp+zVejo/4dME1K6EN8t/1EBtIynEQ+AQ4JZo=";
+    hash = "sha256-xZD/WI/LfNtKK9j6ZjuU0OTTFZz3G4atyD5mVcSsQ8A=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/tools/misc/rocgdb/default.nix b/pkgs/development/tools/misc/rocgdb/default.nix
index 8775ca6d2d68..a2f4435ee7bc 100644
--- a/pkgs/development/tools/misc/rocgdb/default.nix
+++ b/pkgs/development/tools/misc/rocgdb/default.nix
@@ -15,13 +15,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocgdb";
-  version = "5.4.2";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "ROCm-Developer-Tools";
     repo = "ROCgdb";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-DORPvfon32+rIk+YcO9LlUefNvvC7trmiTswg9MMuIs=";
+    hash = "sha256-TlT7vvTrVd7P6ilVnWIG5VIrjTleFgDezK/mudBV+xE=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/development/tools/rocminfo/default.nix b/pkgs/development/tools/rocminfo/default.nix
index 61488b806e88..c9ff79e380ff 100644
--- a/pkgs/development/tools/rocminfo/default.nix
+++ b/pkgs/development/tools/rocminfo/default.nix
@@ -18,14 +18,14 @@
 }:
 
 stdenv.mkDerivation (finalAttrs: {
-  version = "5.4.4";
+  version = "5.7.0";
   pname = "rocminfo";
 
   src = fetchFromGitHub {
     owner = "RadeonOpenCompute";
     repo = "rocminfo";
     rev = "rocm-${finalAttrs.version}";
-    sha256 = "sha256-4wZTm5AZgG8xEd6uYqxWq4bWZgcSYZ2WYA1z4RAPF8U=";
+    sha256 = "sha256-UzOo2qDT/uM+vdGdBM4pV5e143mfa+/6sZLBExOO26g=";
   };
 
   nativeBuildInputs = [
diff --git a/pkgs/tools/system/rocm-smi/default.nix b/pkgs/tools/system/rocm-smi/default.nix
index 7e3f1fb29cc0..2fa79828c63b 100644
--- a/pkgs/tools/system/rocm-smi/default.nix
+++ b/pkgs/tools/system/rocm-smi/default.nix
@@ -8,13 +8,13 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocm-smi";
-  version = "5.4.4";
+  version = "5.7.0";
 
   src = fetchFromGitHub {
     owner = "RadeonOpenCompute";
     repo = "rocm_smi_lib";
     rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-nkidiDNNU6MGhne9EbYClkODJZw/zZu3LWzlniJKyJE=";
+    hash = "sha256-swCRO4PBMBJ6fO2bLq/xxFZIYw2IgiFB490wsU8Wm2o=";
   };
 
   postPatch = ''

From d2bc96e5b5f004885c589ab92201bb77cca30366 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 30 Sep 2023 11:56:54 -0500
Subject: [PATCH 04/30] llvmPackages_rocm: fixup for 5.7.0

llvmPackages_rocm.llvm: fixup for 5.7.0

llvmPackages_rocm.clang-unwrapped: fixup for 5.7.0

llvmPackages_rocm.lld: fixup for 5.7.0

llvmPackages_rocm.runtimes: fixup for 5.7.0

llvmPackages_rocm.libc: fixup for 5.7.0

llvmPackages_rocm.libunwind: fixup for 5.7.0

llvmPackages_rocm.libcxx: fixup for 5.7.0

llvmPackages_rocm.compiler-rt: fixup for 5.7.0

llvmPackages_rocm.clang: fixup for 5.7.0

llvmPackages_rocm.lldb: fixup for 5.7.0

llvmPackages_rocm.polly: fixup for 5.7.0
---
 .../llvm/rocm/1000-libcxx-failing-tests.list  | 171 ++++++++++++++++++
 .../compilers/llvm/rocm/default.nix           |  81 ++++++---
 pkgs/development/compilers/llvm/rocm/llvm.nix |  16 +-
 3 files changed, 234 insertions(+), 34 deletions(-)
 create mode 100644 pkgs/development/compilers/llvm/rocm/1000-libcxx-failing-tests.list

diff --git a/pkgs/development/compilers/llvm/rocm/1000-libcxx-failing-tests.list b/pkgs/development/compilers/llvm/rocm/1000-libcxx-failing-tests.list
new file mode 100644
index 000000000000..e005d6c928c2
--- /dev/null
+++ b/pkgs/development/compilers/llvm/rocm/1000-libcxx-failing-tests.list
@@ -0,0 +1,171 @@
+../libcxx/test/libcxx/containers/gnu_cxx/hash_map.pass.cpp
+../libcxx/test/libcxx/containers/gnu_cxx/hash_set.pass.cpp
+../libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx2a.pass.cpp
+../libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx2a.pass.cpp
+../libcxx/test/libcxx/input.output/filesystems/class.directory_entry/directory_entry.mods/last_write_time.pass.cpp
+../libcxx/test/libcxx/input.output/filesystems/class.path/path.member/path.native.obs/string_alloc.pass.cpp
+../libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
+../libcxx/test/libcxx/localization/locales/locale/locale.types/locale.facet/no_allocation.pass.cpp
+../libcxx/test/libcxx/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_from_underaligned_buffer.pass.cpp
+../libcxx/test/libcxx/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_in_geometric_progression.pass.cpp
+../libcxx/test/std/containers/associative/map/map.access/index_key.pass.cpp
+../libcxx/test/std/containers/associative/map/map.access/index_rv_key.pass.cpp
+../libcxx/test/std/containers/associative/map/map.modifiers/insert_and_emplace_allocator_requirements.pass.cpp
+../libcxx/test/std/containers/associative/multimap/multimap.modifiers/insert_allocator_requirements.pass.cpp
+../libcxx/test/std/containers/associative/multiset/insert_emplace_allocator_requirements.pass.cpp
+../libcxx/test/std/containers/associative/set/insert_and_emplace_allocator_requirements.pass.cpp
+../libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_iter_iter.pass.cpp
+../libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_size_value.pass.cpp
+../libcxx/test/std/containers/sequences/list/list.modifiers/insert_iter_value.pass.cpp
+../libcxx/test/std/containers/sequences/vector.bool/ctor_exceptions.pass.cpp
+../libcxx/test/std/containers/sequences/vector/vector.cons/exceptions.pass.cpp
+../libcxx/test/std/containers/unord/unord.map/unord.map.elem/index.pass.cpp
+../libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/insert_and_emplace_allocator_requirements.pass.cpp
+../libcxx/test/std/containers/unord/unord.multimap/unord.multimap.modifiers/insert_allocator_requirements.pass.cpp
+../libcxx/test/std/containers/unord/unord.multiset/insert_emplace_allocator_requirements.pass.cpp
+../libcxx/test/std/containers/unord/unord.set/insert_and_emplace_allocator_requirements.pass.cpp
+../libcxx/test/std/experimental/memory/memory.resource.global/new_delete_resource.pass.cpp
+../libcxx/test/std/experimental/memory/memory.resource.global/null_memory_resource.pass.cpp
+../libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/pbackfail.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy_assign.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/copy.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move_assign.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/move.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.cons/path.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/assign.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/refresh.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.mods/replace_filename.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_size.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/file_type_obs.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/hard_link_count.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/last_write_time.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/status.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_entry/directory_entry.obs/symlink_status.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy_assign.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/copy.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/ctor.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/increment.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move_assign.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.members/move.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.directory_iterator/directory_iterator.nonmembers/begin_end.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.assign/move.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.assign/source.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.compare.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.construct/move.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.decompose/path.decompose.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.gen/lexically_normal.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.gen/lexically_relative_and_proximate.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.generic.obs/generic_string_alloc.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.generic.obs/named_overloads.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.modifiers/clear.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.modifiers/make_preferred.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.modifiers/remove_filename.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.modifiers/replace_extension.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.modifiers/replace_filename.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.modifiers/swap.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.member/path.native.obs/named_overloads.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.nonmember/path.factory.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.nonmember/path.io.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.path/path.nonmember/swap.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy_assign.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/copy.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/ctor.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/depth.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/disable_recursion_pending.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/increment.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move_assign.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/move.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/pop.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.members/recursion_pending.pass.cpp
+../libcxx/test/std/input.output/filesystems/class.rec.dir.itr/rec.dir.itr.nonmembers/begin_end.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.canonical/canonical.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file_large.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_file/copy_file.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy_symlink/copy_symlink.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.copy/copy.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directories/create_directories.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory_symlink/create_directory_symlink.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory_with_attributes.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_directory/create_directory.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_hard_link/create_hard_link.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.create_symlink/create_symlink.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.current_path/current_path.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.equivalent/equivalent.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.file_size/file_size.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.hard_lk_ct/hard_link_count.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_block_file/is_block_file.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_char_file/is_character_file.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_directory/is_directory.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_empty/is_empty.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_fifo/is_fifo.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_other/is_other.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_regular_file/is_regular_file.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_socket/is_socket.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.is_symlink/is_symlink.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.last_write_time/last_write_time.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.permissions/permissions.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.proximate/proximate.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.read_symlink/read_symlink.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/remove_all.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/toctou.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove/remove.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.resize_file/resize_file.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.space/space.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.status/status.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.symlink_status/symlink_status.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.temp_dir_path/temp_directory_path.pass.cpp
+../libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp
+../libcxx/test/std/localization/locale.categories/category.ctype/facet.ctype.special/facet.ctype.char.dtor/dtor.pass.cpp
+../libcxx/test/std/localization/locale.stdcvt/codecvt_utf16.pass.cpp
+../libcxx/test/std/localization/locale.stdcvt/codecvt_utf8.pass.cpp
+../libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/ctor.pass.cpp
+../libcxx/test/std/localization/locales/locale/locale.members/combine.pass.cpp
+../libcxx/test/std/strings/basic.string/string.cons/substr_rvalue.pass.cpp
+../libcxx/test/std/utilities/any/any.class/any.assign/copy.pass.cpp
+../libcxx/test/std/utilities/any/any.class/any.assign/value.pass.cpp
+../libcxx/test/std/utilities/any/any.class/any.cons/copy.pass.cpp
+../libcxx/test/std/utilities/any/any.class/any.cons/default.pass.cpp
+../libcxx/test/std/utilities/any/any.class/any.cons/in_place_type.pass.cpp
+../libcxx/test/std/utilities/any/any.class/any.cons/move.pass.cpp
+../libcxx/test/std/utilities/any/any.class/any.cons/value.pass.cpp
+../libcxx/test/std/utilities/any/any.class/any.modifiers/emplace.pass.cpp
+../libcxx/test/std/utilities/any/any.nonmembers/any.cast/any_cast_reference.pass.cpp
+../libcxx/test/std/utilities/any/any.nonmembers/make_any.pass.cpp
+../libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.alg/swap.pass.cpp
+../libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/copy_assign.pass.cpp
+../libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/copy_move.pass.cpp
+../libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/F_assign.pass.cpp
+../libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/F.pass.cpp
+../libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.con/nullptr_t_assign.pass.cpp
+../libcxx/test/std/utilities/function.objects/func.wrap/func.wrap.func/func.wrap.func.mod/swap.pass.cpp
+../libcxx/test/std/utilities/memory/default.allocator/allocator.members/allocate_at_least.pass.cpp
+../libcxx/test/std/utilities/memory/default.allocator/allocator.members/allocate.pass.cpp
+../libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.enab/enable_shared_from_this.pass.cpp
+../libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/nullptr_t_deleter_throw.pass.cpp
+../libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_deleter_throw.pass.cpp
+../libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/pointer_throw.pass.cpp
+../libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.const/unique_ptr.pass.cpp
+../libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/make_shared.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.global/null_memory_resource.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.ctor/without_buffer.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_deallocate.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_exception_safety.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_from_initial_buffer.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_from_zero_sized_buffer.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_in_geometric_progression.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_overaligned_request.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.monotonic.buffer/mem.res.monotonic.buffer.mem/allocate_with_initial_size.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.ctor/ctor_does_not_allocate.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/equality.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_allocate_overaligned_request.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_allocate_reuse_blocks.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/sync_allocate.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_allocate_overaligned_request.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_allocate_reuse_blocks.pass.cpp
+../libcxx/test/std/utilities/utility/mem.res/mem.res.pool/mem.res.pool.mem/unsync_allocate.pass.cpp
diff --git a/pkgs/development/compilers/llvm/rocm/default.nix b/pkgs/development/compilers/llvm/rocm/default.nix
index b6df2354f5ec..be2b7e4de375 100644
--- a/pkgs/development/compilers/llvm/rocm/default.nix
+++ b/pkgs/development/compilers/llvm/rocm/default.nix
@@ -22,6 +22,7 @@
 , rocm-device-libs
 , rocm-runtime
 , elfutils
+, graphviz
 , python3Packages
 }:
 
@@ -40,7 +41,6 @@ let
     extraBuildInputs = [ llvm ];
 
     extraCMakeFlags = [
-      "-DCMAKE_POLICY_DEFAULT_CMP0116=NEW"
       "-DCLANG_INCLUDE_DOCS=ON"
       "-DCLANG_INCLUDE_TESTS=ON"
     ];
@@ -59,6 +59,12 @@ let
       # `does not depend on a module exporting 'baz.h'`
       rm test/Modules/header-attribs.cpp
 
+      # We do not have HIP or the ROCm stack available yet
+      rm test/Driver/hip-options.hip
+
+      # ???? `ld: cannot find crti.o: No such file or directory` linker issue?
+      rm test/Interpreter/dynamic-library.cpp
+
       # `fatal error: 'stdio.h' file not found`
       rm test/OpenMP/amdgcn_emit_llvm.c
     '';
@@ -73,19 +79,18 @@ let
     targetName = "lld";
     targetDir = targetName;
     extraBuildInputs = [ llvm ];
-    extraCMakeFlags = [ "-DCMAKE_POLICY_DEFAULT_CMP0116=NEW" ];
-    checkTargets = [ "check-lld" ];
+    checkTargets = [ "check-${targetName}" ];
   };
 
   # Runtimes
-  runtimes = callPackage ./llvm.nix {
+  runtimes = callPackage ./llvm.nix rec {
     buildDocs = false;
     buildMan = false;
     buildTests = false;
-    targetDir = "runtimes";
+    targetName = "runtimes";
+    targetDir = targetName;
 
     targetRuntimes = [
-      # "libc" https://github.com/llvm/llvm-project/issues/57719
       "libunwind"
       "libcxxabi"
       "libcxx"
@@ -95,7 +100,6 @@ let
     extraBuildInputs = [ llvm ];
 
     extraCMakeFlags = [
-      "-DCMAKE_POLICY_DEFAULT_CMP0114=NEW"
       "-DLIBCXX_INCLUDE_BENCHMARKS=OFF"
       "-DLIBCXX_CXX_ABI=libcxxabi"
     ];
@@ -167,10 +171,24 @@ in rec {
   # Runtimes
   libc = callPackage ./llvm.nix rec {
     stdenv = rStdenv;
+    buildMan = false; # No man pages to build
     targetName = "libc";
     targetDir = "runtimes";
     targetRuntimes = [ targetName ];
-    isBroken = true; # https://github.com/llvm/llvm-project/issues/57719
+
+    extraPostPatch = ''
+      # `Failed to match ... against ...` `Match value not within tolerance value of MPFR result:`
+      # We need a better way, but I don't know enough sed magic and patching `CMakeLists.txt` isn't working...
+      substituteInPlace ../libc/test/src/math/log10_test.cpp \
+        --replace "i < N" "i < 0" \
+        --replace "test(mpfr::RoundingMode::Nearest);" "" \
+        --replace "test(mpfr::RoundingMode::Downward);" "" \
+        --replace "test(mpfr::RoundingMode::Upward);" "" \
+        --replace "test(mpfr::RoundingMode::TowardZero);" ""
+    '';
+
+    checkTargets = [ "check-${targetName}" ];
+    hardeningDisable = [ "fortify" ]; # Prevent `error: "Assumed value of MB_LEN_MAX wrong"`
   };
 
   libunwind = callPackage ./llvm.nix rec {
@@ -185,6 +203,14 @@ in rec {
       "-DLIBUNWIND_INCLUDE_TESTS=ON"
       "-DLIBUNWIND_USE_COMPILER_RT=ON"
     ];
+
+    extraPostPatch = ''
+      # `command had no output on stdout or stderr` (Says these unsupported tests)
+      chmod +w -R ../libunwind/test
+      rm ../libunwind/test/floatregister.pass.cpp
+      rm ../libunwind/test/unwind_leaffunction.pass.cpp
+      rm ../libunwind/test/libunwind_02.pass.cpp
+    '';
   };
 
   libcxxabi = callPackage ./llvm.nix rec {
@@ -254,14 +280,7 @@ in rec {
     # Most of these can't find `bash` or `mkdir`, might just be hard-coded paths, or PATH is altered
     extraPostPatch = ''
       chmod +w -R ../libcxx/test/{libcxx,std}
-      rm -rf ../libcxx/test/libcxx/input.output/filesystems
-      rm ../libcxx/test/libcxx/selftest/remote-substitutions.sh.cpp
-      rm ../libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/pbackfail.pass.cpp
-      rm ../libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/pbackfail.pass.cpp
-      rm ../libcxx/test/std/utilities/optional/optional.object/optional.object.assign/emplace_initializer_list.pass.cpp
-      rm ../libcxx/test/std/utilities/optional/optional.object/optional.object.assign/nullopt_t.pass.cpp
-      rm -rf ../libcxx/test/std/utilities/optional/optional.object/optional.object.ctor
-      rm -rf ../libcxx/test/std/input.output/filesystems/{class.directory_entry,class.directory_iterator,class.rec.dir.itr,fs.op.funcs}
+      cat ${./1000-libcxx-failing-tests.list} | xargs -d \\n rm
     '';
   };
 
@@ -280,7 +299,6 @@ in rec {
     ];
 
     extraCMakeFlags = [
-      "-DCMAKE_POLICY_DEFAULT_CMP0114=NEW"
       "-DCOMPILER_RT_INCLUDE_TESTS=ON"
       "-DCOMPILER_RT_USE_LLVM_UNWINDER=ON"
       "-DCOMPILER_RT_CXX_LIBRARY=libcxx"
@@ -313,6 +331,10 @@ in rec {
       # We can run these
       substituteInPlace ../compiler-rt/test/CMakeLists.txt \
         --replace "endfunction()" "endfunction()''\nadd_subdirectory(builtins)''\nadd_subdirectory(shadowcallstack)"
+
+      # Could not launch llvm-config in /build/source/runtimes/build/bin
+      mkdir -p build/bin
+      ln -s ${llvm}/bin/llvm-config build/bin
     '';
 
     extraLicenses = [ lib.licenses.mit ];
@@ -323,7 +345,6 @@ in rec {
   rocmClangStdenv = overrideCC stdenv clang;
 
   clang = wrapCCWith rec {
-    # inherit libc libcxx bintools;
     inherit libcxx bintools;
 
     # We do this to avoid HIP pathing problems, and mimic a monolithic install
@@ -337,14 +358,14 @@ in rec {
         clang_version=`${clang-unwrapped}/bin/clang -v 2>&1 | grep "clang version " | grep -E -o "[0-9.-]+"`
         mkdir -p $out/{bin,include/c++/v1,lib/{cmake,clang/$clang_version/{include,lib}},libexec,share}
 
-        for path in ${llvm} ${clang-unwrapped} ${lld} ${libunwind} ${libcxxabi} ${libcxx} ${compiler-rt}; do
+        for path in ${llvm} ${clang-unwrapped} ${lld} ${libc} ${libunwind} ${libcxxabi} ${libcxx} ${compiler-rt}; do
           cp -as $path/* $out
           chmod +w $out/{*,include/c++/v1,lib/{clang/$clang_version/include,cmake}}
           rm -f $out/lib/libc++.so
         done
 
         ln -s $out/lib/* $out/lib/clang/$clang_version/lib
-        ln -s $out/include/* $out/lib/clang/$clang_version/include
+        ln -sf $out/include/* $out/lib/clang/$clang_version/include
 
         runHook postInstall
       '';
@@ -355,6 +376,7 @@ in rec {
     extraPackages = [
       llvm
       lld
+      libc
       libunwind
       libcxxabi
       compiler-rt
@@ -374,8 +396,7 @@ in rec {
       ln -s ${cc}/lib/clang/$clang_version/{include,lib} $out/resource-root
 
       # Not sure why, but hardening seems to make things break
-      rm $out/nix-support/add-hardening.sh
-      touch $out/nix-support/add-hardening.sh
+      echo "" > $out/nix-support/add-hardening.sh
 
       # GPU compilation uses builtin `lld`
       substituteInPlace $out/bin/{clang,clang++} \
@@ -459,13 +480,20 @@ in rec {
       swig
       lua5_3
       gtest
+      graphviz
     ];
 
     extraCMakeFlags = [
-      "-DLLVM_EXTERNAL_LIT=${lit}/bin/.lit-wrapped"
+      "-DLLDB_EXTERNAL_CLANG_RESOURCE_DIR=${clang}/resource-root/lib/clang/$clang_version"
       "-DLLDB_INCLUDE_TESTS=ON"
       "-DLLDB_INCLUDE_UNITTESTS=ON"
     ];
+
+    extraPostPatch = ''
+      export clang_version=`clang -v 2>&1 | grep "clang version " | grep -E -o "[0-9.-]+"`
+    '';
+
+    checkTargets = [ "check-${targetName}" ];
   };
 
   mlir = callPackage ./llvm.nix rec {
@@ -527,6 +555,13 @@ in rec {
     stdenv = rocmClangStdenv;
     targetName = "polly";
     targetDir = targetName;
+
+    extraPostPatch = ''
+      # `add_library cannot create target "llvm_gtest" because an imported target with the same name already exists`
+      substituteInPlace CMakeLists.txt \
+        --replace "NOT TARGET gtest" "FALSE"
+    '';
+
     checkTargets = [ "check-${targetName}" ];
   };
 
diff --git a/pkgs/development/compilers/llvm/rocm/llvm.nix b/pkgs/development/compilers/llvm/rocm/llvm.nix
index 7fa0bbd35eea..75aa7741a083 100644
--- a/pkgs/development/compilers/llvm/rocm/llvm.nix
+++ b/pkgs/development/compilers/llvm/rocm/llvm.nix
@@ -24,8 +24,7 @@
 , targetDir ? "llvm"
 , targetProjects ? [ ]
 , targetRuntimes ? [ ]
-# "NATIVE" resolves into x86 or aarch64 depending on stdenv
-, llvmTargetsToBuild ? [ "NATIVE" ]
+, llvmTargetsToBuild ? [ "NATIVE" ] # "NATIVE" resolves into x86 or aarch64 depending on stdenv
 , extraPatches ? [ ]
 , extraNativeBuildInputs ? [ ]
 , extraBuildInputs ? [ ]
@@ -108,22 +107,20 @@ in stdenv.mkDerivation (finalAttrs: {
     "-DLLVM_ENABLE_PROJECTS=${lib.concatStringsSep ";" targetProjects}"
   ] ++ lib.optionals ((finalAttrs.passthru.isLLVM || targetDir == "runtimes") && targetRuntimes != [ ]) [
     "-DLLVM_ENABLE_RUNTIMES=${lib.concatStringsSep ";" targetRuntimes}"
-  ] ++ lib.optionals (finalAttrs.passthru.isLLVM || finalAttrs.passthru.isClang) [
-    "-DLLVM_ENABLE_RTTI=ON"
-    "-DLLVM_ENABLE_EH=ON"
+  ] ++ lib.optionals finalAttrs.passthru.isLLVM [
+    "-DLLVM_INSTALL_UTILS=ON"
+    "-DLLVM_INSTALL_GTEST=ON"
   ] ++ lib.optionals (buildDocs || buildMan) [
     "-DLLVM_INCLUDE_DOCS=ON"
     "-DLLVM_BUILD_DOCS=ON"
     # "-DLLVM_ENABLE_DOXYGEN=ON" Way too slow, only uses one core
     "-DLLVM_ENABLE_SPHINX=ON"
-    "-DLLVM_ENABLE_OCAMLDOC=OFF"
     "-DSPHINX_OUTPUT_HTML=ON"
     "-DSPHINX_OUTPUT_MAN=ON"
     "-DSPHINX_WARNINGS_AS_ERRORS=OFF"
   ] ++ lib.optionals buildTests [
     "-DLLVM_INCLUDE_TESTS=ON"
     "-DLLVM_BUILD_TESTS=ON"
-  ] ++ lib.optionals (buildTests && !finalAttrs.passthru.isLLVM) [
     "-DLLVM_EXTERNAL_LIT=${lit}/bin/.lit-wrapped"
   ] ++ extraCMakeFlags;
 
@@ -141,10 +138,7 @@ in stdenv.mkDerivation (finalAttrs: {
   doCheck = buildTests;
   checkTarget = lib.concatStringsSep " " checkTargets;
 
-  postInstall = lib.optionalString finalAttrs.passthru.isLLVM ''
-    # `lit` expects these for some test suites
-    mv bin/{FileCheck,not,count,yaml2obj,obj2yaml} $out/bin
-  '' + lib.optionalString buildMan ''
+  postInstall = lib.optionalString buildMan ''
     mkdir -p $info
   '' + extraPostInstall;
 

From 95768ef3b680afbb54aa01dee19d87c79a50ff20 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 30 Sep 2023 12:13:59 -0500
Subject: [PATCH 05/30] llvmPackages_rocm: add big-parallel for
 clang-unwrapped, clang-tools-extra, and mlir

---
 pkgs/development/compilers/llvm/rocm/default.nix | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pkgs/development/compilers/llvm/rocm/default.nix b/pkgs/development/compilers/llvm/rocm/default.nix
index be2b7e4de375..045fcdae4803 100644
--- a/pkgs/development/compilers/llvm/rocm/default.nix
+++ b/pkgs/development/compilers/llvm/rocm/default.nix
@@ -72,6 +72,8 @@ let
     extraPostInstall = ''
       mv bin/clang-tblgen $out/bin
     '';
+
+    requiredSystemFeatures = [ "big-parallel" ];
   };
 
   lld = callPackage ./llvm.nix rec {
@@ -437,6 +439,8 @@ in rec {
       # Cleanup empty directories
       find $out -type d -empty -delete
     '';
+
+    requiredSystemFeatures = [ "big-parallel" ];
   };
 
   # Projects
@@ -549,6 +553,7 @@ in rec {
     '';
 
     checkTargets = [ "check-${targetName}" ];
+    requiredSystemFeatures = [ "big-parallel" ];
   };
 
   polly = callPackage ./llvm.nix rec {

From c9132cc3be4165275d7aa0c8ffc2805df6d01fb5 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 30 Sep 2023 17:31:19 -0500
Subject: [PATCH 06/30] llvmPackages_rocm: add hardeningDisable pass-in option

---
 pkgs/development/compilers/llvm/rocm/llvm.nix | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkgs/development/compilers/llvm/rocm/llvm.nix b/pkgs/development/compilers/llvm/rocm/llvm.nix
index 75aa7741a083..655192d892bb 100644
--- a/pkgs/development/compilers/llvm/rocm/llvm.nix
+++ b/pkgs/development/compilers/llvm/rocm/llvm.nix
@@ -38,6 +38,7 @@
   )
 )]
 , extraPostInstall ? ""
+, hardeningDisable ? [ ]
 , requiredSystemFeatures ? [ ]
 , extraLicenses ? [ ]
 , isBroken ? false
@@ -153,7 +154,7 @@ in stdenv.mkDerivation (finalAttrs: {
     };
   };
 
-  inherit requiredSystemFeatures;
+  inherit hardeningDisable requiredSystemFeatures;
 
   meta = with lib; {
     description = "ROCm fork of the LLVM compiler infrastructure";

From 117639a256c933269c9888cd54085d5f61619f42 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 30 Sep 2023 21:42:33 -0500
Subject: [PATCH 07/30] llvmPackages_rocm.libclc: mark broken due to ROCm 5.7.0
 LLVM not being up-to-date with LLVM upstream

---
 pkgs/development/compilers/llvm/rocm/default.nix | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkgs/development/compilers/llvm/rocm/default.nix b/pkgs/development/compilers/llvm/rocm/default.nix
index 045fcdae4803..f8bfade2b0c1 100644
--- a/pkgs/development/compilers/llvm/rocm/default.nix
+++ b/pkgs/development/compilers/llvm/rocm/default.nix
@@ -470,6 +470,7 @@ in rec {
     '';
 
     checkTargets = [ ];
+    isBroken = true; # ROCm 5.7.0 doesn't have IR/AttributeMask.h yet...?
   };
 
   lldb = callPackage ./llvm.nix rec {

From 68ca0c26d820689228518758d1ff782645d20532 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 30 Sep 2023 22:34:52 -0500
Subject: [PATCH 08/30] llvmPackages_rocm.lldb: disable tests

---
 pkgs/development/compilers/llvm/rocm/default.nix | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkgs/development/compilers/llvm/rocm/default.nix b/pkgs/development/compilers/llvm/rocm/default.nix
index f8bfade2b0c1..5fe10f956ca6 100644
--- a/pkgs/development/compilers/llvm/rocm/default.nix
+++ b/pkgs/development/compilers/llvm/rocm/default.nix
@@ -475,7 +475,7 @@ in rec {
 
   lldb = callPackage ./llvm.nix rec {
     stdenv = rocmClangStdenv;
-    buildTests = false; # ld.lld: error: unable to find library -lllvm_gtest_main
+    buildTests = false; # FIXME: Bad pathing for clang executable in tests, using relative path most likely
     targetName = "lldb";
     targetDir = targetName;
     extraNativeBuildInputs = [ python3Packages.sphinx-automodapi ];

From 7cb34ce34dee08b1bed6828366db87746d960cb0 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 30 Sep 2023 00:09:04 -0500
Subject: [PATCH 09/30] rocm-core: Use official rocm-core repo

---
 .../libraries/rocm-core/default.nix           | 60 +++++---------
 .../libraries/rocm-core/src/CMakeLists.txt    |  5 --
 .../libraries/rocm-core/src/rocm_version.c    | 10 ---
 .../libraries/rocm-core/src/rocm_version.h    | 82 -------------------
 4 files changed, 19 insertions(+), 138 deletions(-)
 delete mode 100644 pkgs/development/libraries/rocm-core/src/CMakeLists.txt
 delete mode 100644 pkgs/development/libraries/rocm-core/src/rocm_version.c
 delete mode 100644 pkgs/development/libraries/rocm-core/src/rocm_version.h

diff --git a/pkgs/development/libraries/rocm-core/default.nix b/pkgs/development/libraries/rocm-core/default.nix
index 9f1a4ab4ccf5..aae431be8e62 100644
--- a/pkgs/development/libraries/rocm-core/default.nix
+++ b/pkgs/development/libraries/rocm-core/default.nix
@@ -1,58 +1,36 @@
 { lib
 , stdenv
 , fetchFromGitHub
-, runCommand
-, substituteAll
+, rocmUpdateScript
 , cmake
 }:
 
-let
-  rocm_version = with lib; concatStrings (intersperse "0" (splitString "." stdenv.cc.version));
-in stdenv.mkDerivation (finalAttrs: {
+stdenv.mkDerivation (finalAttrs: {
   pname = "rocm-core";
-  version = stdenv.cc.version;
+  version = "5.7.0";
 
-  # Based on https://github.com/rocm-arch/rocm-arch/tree/ad0b15690d403e5822db062ffff4db3912de6669/rocm-core
-  src = let
-    rocm_major = lib.versions.major finalAttrs.version;
-    rocm_minor = lib.versions.minor finalAttrs.version;
-    rocm_patch = lib.versions.patch finalAttrs.version;
-
-    cmake_lists = substituteAll {
-      inherit rocm_version;
-      src = ./src/CMakeLists.txt;
-    };
-
-    version_c = substituteAll {
-      inherit rocm_major rocm_minor rocm_patch;
-      src = ./src/rocm_version.c;
-    };
-
-    version_h = substituteAll {
-      inherit rocm_major rocm_minor rocm_patch;
-      src = ./src/rocm_version.h;
-    };
-  in runCommand "rocm-core-${finalAttrs.version}-source" { preferLocalBuild = true; } ''
-    mkdir -p $out/rocm-core
-    ln -s ${cmake_lists} $out/CMakeLists.txt
-    ln -s ${version_c} $out/rocm_version.c
-    ln -s ${version_h} $out/rocm-core/rocm_version.h
-  '';
+  src = fetchFromGitHub {
+    owner = "RadeonOpenCompute";
+    repo = "rocm-core";
+    rev = "rocm-${finalAttrs.version}";
+    hash = "sha256-jFAHLqf/AR27Nbuq8aypWiKqApNcTgG5LWESVjVCKIg=";
+  };
 
   nativeBuildInputs = [ cmake ];
+  cmakeFlags = [ "-DROCM_VERSION=${finalAttrs.version}" ];
 
-  postInstall = ''
-    mkdir -p $out/include
-    cp -a ../rocm-core $out/include
-    ln -s $out/include/rocm-core/rocm_version.h $out/include
-    ln -s $out/lib/librocm-core.so.1.0.${rocm_version} $out/lib/librocm-core.so.1
-  '';
+  passthru.updateScript = rocmUpdateScript {
+    name = finalAttrs.pname;
+    owner = finalAttrs.src.owner;
+    repo = finalAttrs.src.repo;
+  };
 
   meta = with lib; {
-    description = "ROCm core";
-    homepage = "https://docs.amd.com";
-    license = with licenses; [ ncsa ]; # See src/rocm_version.h
+    description = "Utility for getting the ROCm release version";
+    homepage = "https://github.com/RadeonOpenCompute/rocm-core";
+    license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/rocm-core/src/CMakeLists.txt b/pkgs/development/libraries/rocm-core/src/CMakeLists.txt
deleted file mode 100644
index b2d9fc1f53b8..000000000000
--- a/pkgs/development/libraries/rocm-core/src/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-cmake_minimum_required(VERSION 3.23)
-project(rocm-core)
-add_library(rocm-core SHARED rocm_version.c)
-set_target_properties(rocm-core PROPERTIES VERSION "1.0.@rocm_version@")
-install(TARGETS rocm-core LIBRARY DESTINATION lib)
diff --git a/pkgs/development/libraries/rocm-core/src/rocm_version.c b/pkgs/development/libraries/rocm-core/src/rocm_version.c
deleted file mode 100644
index a35dfc6a72c8..000000000000
--- a/pkgs/development/libraries/rocm-core/src/rocm_version.c
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "rocm-core/rocm_version.h"
-
-VerErrors getROCmVersion(unsigned int *Major, unsigned int *Minor,
-                         unsigned int *Patch) {
-  *Major = @rocm_major@;
-  *Minor = @rocm_minor@;
-  *Patch = @rocm_patch@;
-
-  return 0;
-}
diff --git a/pkgs/development/libraries/rocm-core/src/rocm_version.h b/pkgs/development/libraries/rocm-core/src/rocm_version.h
deleted file mode 100644
index d112a68b8653..000000000000
--- a/pkgs/development/libraries/rocm-core/src/rocm_version.h
+++ /dev/null
@@ -1,82 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-//
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-//
-// Copyright (c) 2014-2021, Advanced Micro Devices, Inc. All rights reserved.
-//
-// Developed by:
-//
-//                 AMD Research and AMD HSA Software Development
-//
-//                 Advanced Micro Devices, Inc.
-//
-//                 www.amd.com
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS WITH THE SOFTWARE.
-//
-////////////////////////////////////////////////////////////////////////////////
-
-
-#ifndef _ROCM_VERSION_H_
-#define _ROCM_VERSION_H_
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif  /* __cplusplus */
-
-
-#define ROCM_VERSION_MAJOR   @rocm_major@
-#define ROCM_VERSION_MINOR   @rocm_minor@
-#define ROCM_VERSION_PATCH   @rocm_patch@
-
-
-typedef enum {
-	VerSuccess=0,
-	VerIncorrecPararmeters,
-	VerValuesNotDefined,
-	VerErrorMAX		//This should always be last value in the enumerations
-} VerErrors;
-
-
-//  API for getting the verion
-//  Return val :  VerErros : API execution status.  The parameters are valid only when the exetution status is SUCCESS==0
-VerErrors getROCmVersion(unsigned int* Major, unsigned int* Minor, unsigned int* Patch) __attribute__((nonnull)) ;
-//  Usage :
-//  int mj=0,mn=0,p=0,ret=0;
-//  ret=getROCMVersion(&mj,&mn,&p);
-//  if(ret !=VerSuccess )  // error occured
-//
-//  check for the values and
-//
-
-
-#ifdef __cplusplus
-}  // end extern "C" block
-#endif
-
-#endif  //_ROCM_VERSION_H_  header guard

From 41ab6719307b93606f6c2d99914ff92d50d46ad2 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 30 Sep 2023 04:52:46 +0000
Subject: [PATCH 10/30] miopengemm: 5.4.3 -> 5.5.0

---
 pkgs/development/libraries/miopengemm/default.nix | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkgs/development/libraries/miopengemm/default.nix b/pkgs/development/libraries/miopengemm/default.nix
index f288cfda5456..92616f79eedc 100644
--- a/pkgs/development/libraries/miopengemm/default.nix
+++ b/pkgs/development/libraries/miopengemm/default.nix
@@ -31,7 +31,7 @@ let
   };
 in stdenv.mkDerivation (finalAttrs: {
   pname = "miopengemm";
-  version = "5.4.3";
+  version = "5.5.0";
 
   outputs = [
     "out"

From 12e7fc6923688a08152ee937d00e272502d2ed63 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sun, 1 Oct 2023 19:36:12 -0500
Subject: [PATCH 11/30] llvmPackages_rocm -> rocmPackages.llvm

---
 .../compilers/llvm/rocm/0000-fix-openmp.patch |  18 -
 .../compilers/llvm/rocm/default.nix           | 644 ------------------
 pkgs/development/rocm-modules/5/default.nix   |   9 +
 .../llvm.nix => rocm-modules/5/llvm/base.nix} |   0
 .../rocm-modules/5/llvm/default.nix           |  53 ++
 .../5/llvm/stage-1/clang-unwrapped.nix        |  46 ++
 .../rocm-modules/5/llvm/stage-1/lld.nix       |  13 +
 .../rocm-modules/5/llvm/stage-1/llvm.nix      |  10 +
 .../rocm-modules/5/llvm/stage-1/runtimes.nix  |  30 +
 .../stage-2}/1000-libcxx-failing-tests.list   |   0
 .../5/llvm/stage-2/bintools-unwrapped.nix     |  28 +
 .../5/llvm/stage-2/compiler-rt.nix            |  63 ++
 .../rocm-modules/5/llvm/stage-2/libc.nix      |  26 +
 .../rocm-modules/5/llvm/stage-2/libcxx.nix    |  42 ++
 .../rocm-modules/5/llvm/stage-2/libcxxabi.nix |  37 +
 .../rocm-modules/5/llvm/stage-2/libunwind.nix |  26 +
 .../rocm-modules/5/llvm/stage-2/rstdenv.nix   |  35 +
 .../5/llvm/stage-3/clang-tools-extra.nix      |  42 ++
 .../rocm-modules/5/llvm/stage-3/clang.nix     |  73 ++
 .../rocm-modules/5/llvm/stage-3/flang.nix     |  33 +
 .../rocm-modules/5/llvm/stage-3/libclc.nix    |  36 +
 .../rocm-modules/5/llvm/stage-3/lldb.nix      |  39 ++
 .../rocm-modules/5/llvm/stage-3/mlir.nix      |  67 ++
 .../rocm-modules/5/llvm/stage-3/openmp.nix    |  45 ++
 .../rocm-modules/5/llvm/stage-3/polly.nix     |  18 +
 .../rocm-modules/5/llvm/stage-3/pstl.nix      |  15 +
 .../default.nix => 5/update.nix}              |   2 +-
 pkgs/test/default.nix                         |   2 +-
 pkgs/top-level/all-packages.nix               |  36 +-
 pkgs/top-level/python-packages.nix            |   4 +-
 30 files changed, 807 insertions(+), 685 deletions(-)
 delete mode 100644 pkgs/development/compilers/llvm/rocm/0000-fix-openmp.patch
 delete mode 100644 pkgs/development/compilers/llvm/rocm/default.nix
 create mode 100644 pkgs/development/rocm-modules/5/default.nix
 rename pkgs/development/{compilers/llvm/rocm/llvm.nix => rocm-modules/5/llvm/base.nix} (100%)
 create mode 100644 pkgs/development/rocm-modules/5/llvm/default.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-1/clang-unwrapped.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-1/lld.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-1/llvm.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-1/runtimes.nix
 rename pkgs/development/{compilers/llvm/rocm => rocm-modules/5/llvm/stage-2}/1000-libcxx-failing-tests.list (100%)
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-2/bintools-unwrapped.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-2/compiler-rt.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-2/libc.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-2/libcxx.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-2/libcxxabi.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-2/libunwind.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-2/rstdenv.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/clang-tools-extra.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/clang.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/libclc.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/lldb.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/openmp.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/polly.nix
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/pstl.nix
 rename pkgs/development/rocm-modules/{update-script/default.nix => 5/update.nix} (91%)

diff --git a/pkgs/development/compilers/llvm/rocm/0000-fix-openmp.patch b/pkgs/development/compilers/llvm/rocm/0000-fix-openmp.patch
deleted file mode 100644
index 2811df7d29f7..000000000000
--- a/pkgs/development/compilers/llvm/rocm/0000-fix-openmp.patch
+++ /dev/null
@@ -1,18 +0,0 @@
-diff --git a/libomptarget/plugins/amdgpu/impl/impl.cpp b/libomptarget/plugins/amdgpu/impl/impl.cpp
-index 80e024789..3a14e0889 100644
---- a/libomptarget/plugins/amdgpu/impl/impl.cpp
-+++ b/libomptarget/plugins/amdgpu/impl/impl.cpp
-@@ -21,10 +21,11 @@ bool is_locked(void *ptr, hsa_status_t *err_p, void **agentBaseAddress) {
-   info.size = sizeof(hsa_amd_pointer_info_t);
-   err = hsa_amd_pointer_info(ptr, &info, nullptr, nullptr, nullptr);
- 
--  if (err != HSA_STATUS_SUCCESS)
-+  if (err != HSA_STATUS_SUCCESS) {
-     DP("Error when getting pointer info\n");
--  else
-+  } else {
-     is_locked = (info.type == HSA_EXT_POINTER_TYPE_LOCKED);
-+  }
- 
-   if (is_locked && agentBaseAddress != nullptr) {
-     // When user passes in a basePtr+offset we need to fix the
diff --git a/pkgs/development/compilers/llvm/rocm/default.nix b/pkgs/development/compilers/llvm/rocm/default.nix
deleted file mode 100644
index 5fe10f956ca6..000000000000
--- a/pkgs/development/compilers/llvm/rocm/default.nix
+++ /dev/null
@@ -1,644 +0,0 @@
-{ lib
-, stdenv
-, callPackage
-, overrideCC
-, wrapCCWith
-, wrapBintoolsWith
-, runCommand
-, lit
-, glibc
-, spirv-llvm-translator
-, xz
-, swig
-, lua5_3
-, gtest
-, hip
-, rocm-comgr
-, vulkan-loader
-, vulkan-headers
-, glslang
-, shaderc
-, perl
-, rocm-device-libs
-, rocm-runtime
-, elfutils
-, graphviz
-, python3Packages
-}:
-
-let
-  # Stage 1
-  # Base
-  llvm = callPackage ./llvm.nix {
-    requiredSystemFeatures = [ "big-parallel" ];
-    isBroken = stdenv.isAarch64; # https://github.com/RadeonOpenCompute/ROCm/issues/1831#issuecomment-1278205344
-  };
-
-  # Projects
-  clang-unwrapped = callPackage ./llvm.nix rec {
-    targetName = "clang";
-    targetDir = targetName;
-    extraBuildInputs = [ llvm ];
-
-    extraCMakeFlags = [
-      "-DCLANG_INCLUDE_DOCS=ON"
-      "-DCLANG_INCLUDE_TESTS=ON"
-    ];
-
-    extraPostPatch = ''
-      # Looks like they forgot to add finding libedit to the standalone build
-      ln -s ../cmake/Modules/FindLibEdit.cmake cmake/modules
-
-      substituteInPlace CMakeLists.txt \
-        --replace "include(CheckIncludeFile)" "include(CheckIncludeFile)''\nfind_package(LibEdit)"
-
-      # `No such file or directory: '/build/source/clang/tools/scan-build/bin/scan-build'`
-      rm test/Analysis/scan-build/*.test
-      rm test/Analysis/scan-build/rebuild_index/rebuild_index.test
-
-      # `does not depend on a module exporting 'baz.h'`
-      rm test/Modules/header-attribs.cpp
-
-      # We do not have HIP or the ROCm stack available yet
-      rm test/Driver/hip-options.hip
-
-      # ???? `ld: cannot find crti.o: No such file or directory` linker issue?
-      rm test/Interpreter/dynamic-library.cpp
-
-      # `fatal error: 'stdio.h' file not found`
-      rm test/OpenMP/amdgcn_emit_llvm.c
-    '';
-
-    extraPostInstall = ''
-      mv bin/clang-tblgen $out/bin
-    '';
-
-    requiredSystemFeatures = [ "big-parallel" ];
-  };
-
-  lld = callPackage ./llvm.nix rec {
-    buildMan = false; # No man pages to build
-    targetName = "lld";
-    targetDir = targetName;
-    extraBuildInputs = [ llvm ];
-    checkTargets = [ "check-${targetName}" ];
-  };
-
-  # Runtimes
-  runtimes = callPackage ./llvm.nix rec {
-    buildDocs = false;
-    buildMan = false;
-    buildTests = false;
-    targetName = "runtimes";
-    targetDir = targetName;
-
-    targetRuntimes = [
-      "libunwind"
-      "libcxxabi"
-      "libcxx"
-      "compiler-rt"
-    ];
-
-    extraBuildInputs = [ llvm ];
-
-    extraCMakeFlags = [
-      "-DLIBCXX_INCLUDE_BENCHMARKS=OFF"
-      "-DLIBCXX_CXX_ABI=libcxxabi"
-    ];
-
-    extraLicenses = [ lib.licenses.mit ];
-  };
-
-  # Stage 2
-  # Helpers
-  rStdenv = overrideCC stdenv (wrapCCWith rec {
-    inherit bintools;
-    libcxx = runtimes;
-    cc = clang-unwrapped;
-
-    extraPackages = [
-      llvm
-      lld
-    ];
-
-    nixSupport.cc-cflags = [
-      "-resource-dir=$out/resource-root"
-      "-fuse-ld=lld"
-      "-rtlib=compiler-rt"
-      "-unwindlib=libunwind"
-      "-Wno-unused-command-line-argument"
-    ];
-
-    extraBuildCommands = ''
-      clang_version=`${cc}/bin/clang -v 2>&1 | grep "clang version " | grep -E -o "[0-9.-]+"`
-      mkdir -p $out/resource-root
-      ln -s ${cc}/lib/clang/$clang_version/include $out/resource-root
-      ln -s ${runtimes}/lib $out/resource-root
-    '';
-  });
-
-  bintools = wrapBintoolsWith { bintools = bintools-unwrapped; };
-
-  bintools-unwrapped = runCommand "rocm-llvm-binutils-${llvm.version}" { preferLocalBuild = true; } ''
-    mkdir -p $out/bin
-
-    for prog in ${lld}/bin/*; do
-      ln -s $prog $out/bin/$(basename $prog)
-    done
-
-    for prog in ${llvm}/bin/*; do
-      ln -sf $prog $out/bin/$(basename $prog)
-    done
-
-    ln -s ${llvm}/bin/llvm-ar $out/bin/ar
-    ln -s ${llvm}/bin/llvm-as $out/bin/as
-    ln -s ${llvm}/bin/llvm-dwp $out/bin/dwp
-    ln -s ${llvm}/bin/llvm-nm $out/bin/nm
-    ln -s ${llvm}/bin/llvm-objcopy $out/bin/objcopy
-    ln -s ${llvm}/bin/llvm-objdump $out/bin/objdump
-    ln -s ${llvm}/bin/llvm-ranlib $out/bin/ranlib
-    ln -s ${llvm}/bin/llvm-readelf $out/bin/readelf
-    ln -s ${llvm}/bin/llvm-size $out/bin/size
-    ln -s ${llvm}/bin/llvm-strip $out/bin/strip
-    ln -s ${lld}/bin/lld $out/bin/ld
-  '';
-in rec {
-  inherit
-  llvm
-  clang-unwrapped
-  lld
-  bintools
-  bintools-unwrapped;
-
-  # Runtimes
-  libc = callPackage ./llvm.nix rec {
-    stdenv = rStdenv;
-    buildMan = false; # No man pages to build
-    targetName = "libc";
-    targetDir = "runtimes";
-    targetRuntimes = [ targetName ];
-
-    extraPostPatch = ''
-      # `Failed to match ... against ...` `Match value not within tolerance value of MPFR result:`
-      # We need a better way, but I don't know enough sed magic and patching `CMakeLists.txt` isn't working...
-      substituteInPlace ../libc/test/src/math/log10_test.cpp \
-        --replace "i < N" "i < 0" \
-        --replace "test(mpfr::RoundingMode::Nearest);" "" \
-        --replace "test(mpfr::RoundingMode::Downward);" "" \
-        --replace "test(mpfr::RoundingMode::Upward);" "" \
-        --replace "test(mpfr::RoundingMode::TowardZero);" ""
-    '';
-
-    checkTargets = [ "check-${targetName}" ];
-    hardeningDisable = [ "fortify" ]; # Prevent `error: "Assumed value of MB_LEN_MAX wrong"`
-  };
-
-  libunwind = callPackage ./llvm.nix rec {
-    stdenv = rStdenv;
-    buildMan = false; # No man pages to build
-    targetName = "libunwind";
-    targetDir = "runtimes";
-    targetRuntimes = [ targetName ];
-
-    extraCMakeFlags = [
-      "-DLIBUNWIND_INCLUDE_DOCS=ON"
-      "-DLIBUNWIND_INCLUDE_TESTS=ON"
-      "-DLIBUNWIND_USE_COMPILER_RT=ON"
-    ];
-
-    extraPostPatch = ''
-      # `command had no output on stdout or stderr` (Says these unsupported tests)
-      chmod +w -R ../libunwind/test
-      rm ../libunwind/test/floatregister.pass.cpp
-      rm ../libunwind/test/unwind_leaffunction.pass.cpp
-      rm ../libunwind/test/libunwind_02.pass.cpp
-    '';
-  };
-
-  libcxxabi = callPackage ./llvm.nix rec {
-    stdenv = rStdenv;
-    buildDocs = false; # No documentation to build
-    buildMan = false; # No man pages to build
-    targetName = "libcxxabi";
-    targetDir = "runtimes";
-
-    targetRuntimes = [
-      "libunwind"
-      targetName
-      "libcxx"
-    ];
-
-    extraCMakeFlags = [
-      "-DLIBCXXABI_INCLUDE_TESTS=ON"
-      "-DLIBCXXABI_USE_LLVM_UNWINDER=ON"
-      "-DLIBCXXABI_USE_COMPILER_RT=ON"
-
-      # Workaround having to build combined
-      "-DLIBUNWIND_INCLUDE_DOCS=OFF"
-      "-DLIBUNWIND_INCLUDE_TESTS=OFF"
-      "-DLIBUNWIND_USE_COMPILER_RT=ON"
-      "-DLIBUNWIND_INSTALL_LIBRARY=OFF"
-      "-DLIBUNWIND_INSTALL_HEADERS=OFF"
-      "-DLIBCXX_INCLUDE_DOCS=OFF"
-      "-DLIBCXX_INCLUDE_TESTS=OFF"
-      "-DLIBCXX_USE_COMPILER_RT=ON"
-      "-DLIBCXX_CXX_ABI=libcxxabi"
-      "-DLIBCXX_INSTALL_LIBRARY=OFF"
-      "-DLIBCXX_INSTALL_HEADERS=OFF"
-    ];
-  };
-
-  libcxx = callPackage ./llvm.nix rec {
-    stdenv = rStdenv;
-    buildMan = false; # No man pages to build
-    targetName = "libcxx";
-    targetDir = "runtimes";
-
-    targetRuntimes = [
-      "libunwind"
-      "libcxxabi"
-      targetName
-    ];
-
-    extraCMakeFlags = [
-      "-DLIBCXX_INCLUDE_DOCS=ON"
-      "-DLIBCXX_INCLUDE_TESTS=ON"
-      "-DLIBCXX_USE_COMPILER_RT=ON"
-      "-DLIBCXX_CXX_ABI=libcxxabi"
-
-      # Workaround having to build combined
-      "-DLIBUNWIND_INCLUDE_DOCS=OFF"
-      "-DLIBUNWIND_INCLUDE_TESTS=OFF"
-      "-DLIBUNWIND_USE_COMPILER_RT=ON"
-      "-DLIBUNWIND_INSTALL_LIBRARY=OFF"
-      "-DLIBUNWIND_INSTALL_HEADERS=OFF"
-      "-DLIBCXXABI_INCLUDE_TESTS=OFF"
-      "-DLIBCXXABI_USE_LLVM_UNWINDER=ON"
-      "-DLIBCXXABI_USE_COMPILER_RT=ON"
-      "-DLIBCXXABI_INSTALL_LIBRARY=OFF"
-      "-DLIBCXXABI_INSTALL_HEADERS=OFF"
-    ];
-
-    # Most of these can't find `bash` or `mkdir`, might just be hard-coded paths, or PATH is altered
-    extraPostPatch = ''
-      chmod +w -R ../libcxx/test/{libcxx,std}
-      cat ${./1000-libcxx-failing-tests.list} | xargs -d \\n rm
-    '';
-  };
-
-  compiler-rt = callPackage ./llvm.nix rec {
-    stdenv = rStdenv;
-    buildDocs = false; # No documentation to build
-    buildMan = false; # No man pages to build
-    targetName = "compiler-rt";
-    targetDir = "runtimes";
-
-    targetRuntimes = [
-      "libunwind"
-      "libcxxabi"
-      "libcxx"
-      targetName
-    ];
-
-    extraCMakeFlags = [
-      "-DCOMPILER_RT_INCLUDE_TESTS=ON"
-      "-DCOMPILER_RT_USE_LLVM_UNWINDER=ON"
-      "-DCOMPILER_RT_CXX_LIBRARY=libcxx"
-      "-DCOMPILER_RT_CAN_EXECUTE_TESTS=OFF" # We can't run most of these
-
-      # Workaround having to build combined
-      "-DLIBUNWIND_INCLUDE_DOCS=OFF"
-      "-DLIBUNWIND_INCLUDE_TESTS=OFF"
-      "-DLIBUNWIND_USE_COMPILER_RT=ON"
-      "-DLIBUNWIND_INSTALL_LIBRARY=OFF"
-      "-DLIBUNWIND_INSTALL_HEADERS=OFF"
-      "-DLIBCXXABI_INCLUDE_TESTS=OFF"
-      "-DLIBCXXABI_USE_LLVM_UNWINDER=ON"
-      "-DLIBCXXABI_USE_COMPILER_RT=ON"
-      "-DLIBCXXABI_INSTALL_LIBRARY=OFF"
-      "-DLIBCXXABI_INSTALL_HEADERS=OFF"
-      "-DLIBCXX_INCLUDE_DOCS=OFF"
-      "-DLIBCXX_INCLUDE_TESTS=OFF"
-      "-DLIBCXX_USE_COMPILER_RT=ON"
-      "-DLIBCXX_CXX_ABI=libcxxabi"
-      "-DLIBCXX_INSTALL_LIBRARY=OFF"
-      "-DLIBCXX_INSTALL_HEADERS=OFF"
-    ];
-
-    extraPostPatch = ''
-      # `No such file or directory: 'ldd'`
-      substituteInPlace ../compiler-rt/test/lit.common.cfg.py \
-        --replace "'ldd'," "'${glibc.bin}/bin/ldd',"
-
-      # We can run these
-      substituteInPlace ../compiler-rt/test/CMakeLists.txt \
-        --replace "endfunction()" "endfunction()''\nadd_subdirectory(builtins)''\nadd_subdirectory(shadowcallstack)"
-
-      # Could not launch llvm-config in /build/source/runtimes/build/bin
-      mkdir -p build/bin
-      ln -s ${llvm}/bin/llvm-config build/bin
-    '';
-
-    extraLicenses = [ lib.licenses.mit ];
-  };
-
-  # Stage 3
-  # Helpers
-  rocmClangStdenv = overrideCC stdenv clang;
-
-  clang = wrapCCWith rec {
-    inherit libcxx bintools;
-
-    # We do this to avoid HIP pathing problems, and mimic a monolithic install
-    cc = stdenv.mkDerivation (finalAttrs: {
-      inherit (clang-unwrapped) pname version;
-      dontUnpack = true;
-
-      installPhase = ''
-        runHook preInstall
-
-        clang_version=`${clang-unwrapped}/bin/clang -v 2>&1 | grep "clang version " | grep -E -o "[0-9.-]+"`
-        mkdir -p $out/{bin,include/c++/v1,lib/{cmake,clang/$clang_version/{include,lib}},libexec,share}
-
-        for path in ${llvm} ${clang-unwrapped} ${lld} ${libc} ${libunwind} ${libcxxabi} ${libcxx} ${compiler-rt}; do
-          cp -as $path/* $out
-          chmod +w $out/{*,include/c++/v1,lib/{clang/$clang_version/include,cmake}}
-          rm -f $out/lib/libc++.so
-        done
-
-        ln -s $out/lib/* $out/lib/clang/$clang_version/lib
-        ln -sf $out/include/* $out/lib/clang/$clang_version/include
-
-        runHook postInstall
-      '';
-
-      passthru.isClang = true;
-    });
-
-    extraPackages = [
-      llvm
-      lld
-      libc
-      libunwind
-      libcxxabi
-      compiler-rt
-    ];
-
-    nixSupport.cc-cflags = [
-      "-resource-dir=$out/resource-root"
-      "-fuse-ld=lld"
-      "-rtlib=compiler-rt"
-      "-unwindlib=libunwind"
-      "-Wno-unused-command-line-argument"
-    ];
-
-    extraBuildCommands = ''
-      clang_version=`${cc}/bin/clang -v 2>&1 | grep "clang version " | grep -E -o "[0-9.-]+"`
-      mkdir -p $out/resource-root
-      ln -s ${cc}/lib/clang/$clang_version/{include,lib} $out/resource-root
-
-      # Not sure why, but hardening seems to make things break
-      echo "" > $out/nix-support/add-hardening.sh
-
-      # GPU compilation uses builtin `lld`
-      substituteInPlace $out/bin/{clang,clang++} \
-        --replace "-MM) dontLink=1 ;;" "-MM | --cuda-device-only) dontLink=1 ;;''\n--cuda-host-only | --cuda-compile-host-device) dontLink=0 ;;"
-    '';
-  };
-
-  # Base
-  # Unfortunately, we cannot build `clang-tools-extra` separately.
-  clang-tools-extra = callPackage ./llvm.nix {
-    stdenv = rocmClangStdenv;
-    buildTests = false; # `invalid operands to binary expression ('std::basic_stringstream<char>' and 'const llvm::StringRef')`
-    targetName = "clang-tools-extra";
-
-    targetProjects = [
-      "clang"
-      "clang-tools-extra"
-    ];
-
-    extraBuildInputs = [ gtest ];
-
-    extraCMakeFlags = [
-      "-DLLVM_INCLUDE_DOCS=OFF"
-      "-DLLVM_INCLUDE_TESTS=OFF"
-      "-DCLANG_INCLUDE_DOCS=OFF"
-      "-DCLANG_INCLUDE_TESTS=ON"
-      "-DCLANG_TOOLS_EXTRA_INCLUDE_DOCS=ON"
-    ];
-
-    extraPostInstall = ''
-      # Remove LLVM and Clang
-      for path in `find ${llvm} ${clang-unwrapped}`; do
-        if [ $path != ${llvm} ] && [ $path != ${clang-unwrapped} ]; then
-          rm -f $out''${path#${llvm}} $out''${path#${clang-unwrapped}} || true
-        fi
-      done
-
-      # Cleanup empty directories
-      find $out -type d -empty -delete
-    '';
-
-    requiredSystemFeatures = [ "big-parallel" ];
-  };
-
-  # Projects
-  libclc = let
-    spirv = (spirv-llvm-translator.override { inherit llvm; });
-  in callPackage ./llvm.nix rec {
-    stdenv = rocmClangStdenv;
-    buildDocs = false; # No documentation to build
-    buildMan = false; # No man pages to build
-    targetName = "libclc";
-    targetDir = targetName;
-    extraBuildInputs = [ spirv ];
-
-    # `spirv-mesa3d` isn't compiling with LLVM 15.0.0, it does with LLVM 14.0.0
-    # Try removing the `spirv-mesa3d` and `clspv` patches next update
-    # `clspv` tests fail, unresolved calls
-    extraPostPatch = ''
-      substituteInPlace CMakeLists.txt \
-        --replace "find_program( LLVM_CLANG clang PATHS \''${LLVM_BINDIR} NO_DEFAULT_PATH )" \
-          "find_program( LLVM_CLANG clang PATHS \"${clang}/bin\" NO_DEFAULT_PATH )" \
-        --replace "find_program( LLVM_SPIRV llvm-spirv PATHS \''${LLVM_BINDIR} NO_DEFAULT_PATH )" \
-          "find_program( LLVM_SPIRV llvm-spirv PATHS \"${spirv}/bin\" NO_DEFAULT_PATH )" \
-        --replace "  spirv-mesa3d-" "" \
-        --replace "  spirv64-mesa3d-" "" \
-        --replace "NOT \''${t} MATCHES" \
-          "NOT \''${ARCH} STREQUAL \"clspv\" AND NOT \''${ARCH} STREQUAL \"clspv64\" AND NOT \''${t} MATCHES"
-    '';
-
-    checkTargets = [ ];
-    isBroken = true; # ROCm 5.7.0 doesn't have IR/AttributeMask.h yet...?
-  };
-
-  lldb = callPackage ./llvm.nix rec {
-    stdenv = rocmClangStdenv;
-    buildTests = false; # FIXME: Bad pathing for clang executable in tests, using relative path most likely
-    targetName = "lldb";
-    targetDir = targetName;
-    extraNativeBuildInputs = [ python3Packages.sphinx-automodapi ];
-
-    extraBuildInputs = [
-      xz
-      swig
-      lua5_3
-      gtest
-      graphviz
-    ];
-
-    extraCMakeFlags = [
-      "-DLLDB_EXTERNAL_CLANG_RESOURCE_DIR=${clang}/resource-root/lib/clang/$clang_version"
-      "-DLLDB_INCLUDE_TESTS=ON"
-      "-DLLDB_INCLUDE_UNITTESTS=ON"
-    ];
-
-    extraPostPatch = ''
-      export clang_version=`clang -v 2>&1 | grep "clang version " | grep -E -o "[0-9.-]+"`
-    '';
-
-    checkTargets = [ "check-${targetName}" ];
-  };
-
-  mlir = callPackage ./llvm.nix rec {
-    stdenv = rocmClangStdenv;
-    buildDocs = false; # No decent way to hack this to work
-    buildMan = false; # No man pages to build
-    targetName = "mlir";
-    targetDir = targetName;
-    extraNativeBuildInputs = [ hip ];
-
-    extraBuildInputs = [
-      rocm-comgr
-      vulkan-headers
-      vulkan-loader
-      glslang
-      shaderc
-    ];
-
-    extraCMakeFlags = [
-      "-DCMAKE_POLICY_DEFAULT_CMP0116=NEW"
-      "-DMLIR_INCLUDE_DOCS=ON"
-      "-DMLIR_INCLUDE_TESTS=ON"
-      "-DMLIR_ENABLE_ROCM_RUNNER=ON"
-      "-DMLIR_ENABLE_SPIRV_CPU_RUNNER=ON"
-      "-DMLIR_ENABLE_VULKAN_RUNNER=ON"
-      "-DROCM_TEST_CHIPSET=gfx000" # CPU runner
-    ];
-
-    extraPostPatch = ''
-      chmod +w ../llvm
-      mkdir -p ../llvm/build/bin
-      ln -s ${lit}/bin/lit ../llvm/build/bin/llvm-lit
-
-      substituteInPlace test/CMakeLists.txt \
-        --replace "FileCheck count not" "" \
-        --replace "list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)" ""
-
-      substituteInPlace lib/ExecutionEngine/CMakeLists.txt \
-        --replace "return()" ""
-
-      # Remove problematic tests
-      rm test/CAPI/execution_engine.c
-      rm test/Target/LLVMIR/llvmir-intrinsics.mlir
-      rm test/Target/LLVMIR/llvmir.mlir
-      rm test/Target/LLVMIR/openmp-llvm.mlir
-      rm test/mlir-cpu-runner/*.mlir
-      rm test/mlir-vulkan-runner/*.mlir
-    '';
-
-    extraPostInstall = ''
-      mkdir -p $out/bin
-      mv bin/mlir-tblgen $out/bin
-    '';
-
-    checkTargets = [ "check-${targetName}" ];
-    requiredSystemFeatures = [ "big-parallel" ];
-  };
-
-  polly = callPackage ./llvm.nix rec {
-    stdenv = rocmClangStdenv;
-    targetName = "polly";
-    targetDir = targetName;
-
-    extraPostPatch = ''
-      # `add_library cannot create target "llvm_gtest" because an imported target with the same name already exists`
-      substituteInPlace CMakeLists.txt \
-        --replace "NOT TARGET gtest" "FALSE"
-    '';
-
-    checkTargets = [ "check-${targetName}" ];
-  };
-
-  flang = callPackage ./llvm.nix rec {
-    stdenv = rocmClangStdenv;
-    buildTests = false; # `Executable "flang1" doesn't exist!`
-    targetName = "flang";
-    targetDir = targetName;
-    extraNativeBuildInputs = [ python3Packages.sphinx-markdown-tables ];
-    extraBuildInputs = [ mlir ];
-
-    extraCMakeFlags = [
-      "-DCMAKE_POLICY_DEFAULT_CMP0116=NEW"
-      "-DCLANG_DIR=${clang-unwrapped}/lib/cmake/clang"
-      "-DFLANG_INCLUDE_TESTS=OFF"
-      "-DMLIR_TABLEGEN_EXE=${mlir}/bin/mlir-tblgen"
-    ];
-
-    extraPostPatch = ''
-      substituteInPlace test/CMakeLists.txt \
-        --replace "FileCheck" "" \
-        --replace "count" "" \
-        --replace "not" ""
-
-      substituteInPlace docs/CMakeLists.txt \
-        --replace "CLANG_TABLEGEN_EXE clang-tblgen" "CLANG_TABLEGEN_EXE ${clang-unwrapped}/bin/clang-tblgen"
-    '';
-  };
-
-  openmp = callPackage ./llvm.nix rec {
-    stdenv = rocmClangStdenv;
-    buildTests = false; # Too many failures, most pass
-    targetName = "openmp";
-    targetDir = targetName;
-    extraPatches = [ ./0000-fix-openmp.patch ];
-    extraNativeBuildInputs = [ perl ];
-
-    extraBuildInputs = [
-      rocm-device-libs
-      rocm-runtime
-      elfutils
-    ];
-
-    extraCMakeFlags = [
-      "-DCMAKE_MODULE_PATH=/build/source/llvm/cmake/modules" # For docs
-      "-DCLANG_TOOL=${clang}/bin/clang"
-      "-DCLANG_OFFLOAD_BUNDLER_TOOL=${clang-unwrapped}/bin/clang-offload-bundler"
-      "-DOPENMP_LLVM_TOOLS_DIR=${llvm}/bin"
-      "-DOPENMP_LLVM_LIT_EXECUTABLE=${lit}/bin/.lit-wrapped"
-      "-DDEVICELIBS_ROOT=${rocm-device-libs.src}"
-    ];
-
-    extraPostPatch = ''
-      # We can't build this target at the moment
-      substituteInPlace libomptarget/DeviceRTL/CMakeLists.txt \
-        --replace "gfx1010" ""
-    '';
-
-    checkTargets = [ "check-${targetName}" ];
-    extraLicenses = [ lib.licenses.mit ];
-  };
-
-  # Runtimes
-  pstl = callPackage ./llvm.nix rec {
-    stdenv = rocmClangStdenv;
-    buildDocs = false; # No documentation to build
-    buildMan = false; # No man pages to build
-    buildTests = false; # Too many errors
-    targetName = "pstl";
-    targetDir = "runtimes";
-    targetRuntimes = [ targetName ];
-    checkTargets = [ "check-${targetName}" ];
-  };
-}
diff --git a/pkgs/development/rocm-modules/5/default.nix b/pkgs/development/rocm-modules/5/default.nix
new file mode 100644
index 000000000000..6509f8850858
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/default.nix
@@ -0,0 +1,9 @@
+{ callPackage
+, recurseIntoAttrs
+}:
+
+let
+  rocmUpdateScript = callPackage ./update.nix { };
+in {
+  llvm = recurseIntoAttrs (callPackage ./llvm/default.nix { inherit rocmUpdateScript; });
+}
diff --git a/pkgs/development/compilers/llvm/rocm/llvm.nix b/pkgs/development/rocm-modules/5/llvm/base.nix
similarity index 100%
rename from pkgs/development/compilers/llvm/rocm/llvm.nix
rename to pkgs/development/rocm-modules/5/llvm/base.nix
diff --git a/pkgs/development/rocm-modules/5/llvm/default.nix b/pkgs/development/rocm-modules/5/llvm/default.nix
new file mode 100644
index 000000000000..11f8241251e6
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/default.nix
@@ -0,0 +1,53 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+, wrapBintoolsWith
+, overrideCC
+}:
+
+let
+  ## Stage 1 ##
+  # Projects
+  llvm = callPackage ./stage-1/llvm.nix { inherit rocmUpdateScript; };
+  clang-unwrapped = callPackage ./stage-1/clang-unwrapped.nix { inherit rocmUpdateScript llvm; };
+  lld = callPackage ./stage-1/lld.nix { inherit rocmUpdateScript llvm; };
+
+  # Runtimes
+  runtimes = callPackage ./stage-1/runtimes.nix { inherit rocmUpdateScript llvm; };
+
+  ## Stage 2 ##
+  # Helpers
+  bintools-unwrapped = callPackage ./stage-2/bintools-unwrapped.nix { inherit llvm lld; };
+  bintools = wrapBintoolsWith { bintools = bintools-unwrapped; };
+  rStdenv = callPackage ./stage-2/rstdenv.nix { inherit llvm clang-unwrapped lld runtimes bintools; };
+in rec {
+  inherit
+  llvm
+  clang-unwrapped
+  lld
+  bintools;
+
+  # Runtimes
+  libc = callPackage ./stage-2/libc.nix { inherit rocmUpdateScript; stdenv = rStdenv; };
+  libunwind = callPackage ./stage-2/libunwind.nix { inherit rocmUpdateScript; stdenv = rStdenv; };
+  libcxxabi = callPackage ./stage-2/libcxxabi.nix { inherit rocmUpdateScript; stdenv = rStdenv; };
+  libcxx = callPackage ./stage-2/libcxx.nix { inherit rocmUpdateScript; stdenv = rStdenv; };
+  compiler-rt = callPackage ./stage-2/compiler-rt.nix { inherit rocmUpdateScript llvm; stdenv = rStdenv; };
+
+  ## Stage 3 ##
+  # Helpers
+  clang = callPackage ./stage-3/clang.nix { inherit llvm lld clang-unwrapped bintools libc libunwind libcxxabi libcxx compiler-rt; };
+  rocmClangStdenv = overrideCC stdenv clang;
+
+  # Projects
+  clang-tools-extra = callPackage ./stage-3/clang-tools-extra.nix { inherit rocmUpdateScript llvm clang-unwrapped; stdenv = rocmClangStdenv; };
+  libclc = callPackage ./stage-3/libclc.nix { inherit rocmUpdateScript llvm clang; stdenv = rocmClangStdenv; };
+  lldb = callPackage ./stage-3/lldb.nix { inherit rocmUpdateScript clang; stdenv = rocmClangStdenv; };
+  mlir = callPackage ./stage-3/mlir.nix { inherit rocmUpdateScript; stdenv = rocmClangStdenv; };
+  polly = callPackage ./stage-3/polly.nix { inherit rocmUpdateScript; stdenv = rocmClangStdenv; };
+  flang = callPackage ./stage-3/flang.nix { inherit rocmUpdateScript clang-unwrapped mlir; stdenv = rocmClangStdenv; };
+  openmp = callPackage ./stage-3/openmp.nix { inherit rocmUpdateScript llvm clang-unwrapped clang; stdenv = rocmClangStdenv; };
+
+  # Runtimes
+  pstl = callPackage ./stage-3/pstl.nix { inherit rocmUpdateScript; stdenv = rocmClangStdenv; };
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-1/clang-unwrapped.nix b/pkgs/development/rocm-modules/5/llvm/stage-1/clang-unwrapped.nix
new file mode 100644
index 000000000000..113313f4e066
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-1/clang-unwrapped.nix
@@ -0,0 +1,46 @@
+{ callPackage
+, rocmUpdateScript
+, llvm
+}:
+
+callPackage ../base.nix rec {
+  inherit rocmUpdateScript;
+  targetName = "clang-unwrapped";
+  targetDir = "clang";
+  extraBuildInputs = [ llvm ];
+
+  extraCMakeFlags = [
+    "-DCLANG_INCLUDE_DOCS=ON"
+    "-DCLANG_INCLUDE_TESTS=ON"
+  ];
+
+  extraPostPatch = ''
+    # Looks like they forgot to add finding libedit to the standalone build
+    ln -s ../cmake/Modules/FindLibEdit.cmake cmake/modules
+
+    substituteInPlace CMakeLists.txt \
+      --replace "include(CheckIncludeFile)" "include(CheckIncludeFile)''\nfind_package(LibEdit)"
+
+    # `No such file or directory: '/build/source/clang/tools/scan-build/bin/scan-build'`
+    rm test/Analysis/scan-build/*.test
+    rm test/Analysis/scan-build/rebuild_index/rebuild_index.test
+
+    # `does not depend on a module exporting 'baz.h'`
+    rm test/Modules/header-attribs.cpp
+
+    # We do not have HIP or the ROCm stack available yet
+    rm test/Driver/hip-options.hip
+
+    # ???? `ld: cannot find crti.o: No such file or directory` linker issue?
+    rm test/Interpreter/dynamic-library.cpp
+
+    # `fatal error: 'stdio.h' file not found`
+    rm test/OpenMP/amdgcn_emit_llvm.c
+  '';
+
+  extraPostInstall = ''
+    mv bin/clang-tblgen $out/bin
+  '';
+
+  requiredSystemFeatures = [ "big-parallel" ];
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-1/lld.nix b/pkgs/development/rocm-modules/5/llvm/stage-1/lld.nix
new file mode 100644
index 000000000000..a7b042eabfe6
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-1/lld.nix
@@ -0,0 +1,13 @@
+{ callPackage
+, rocmUpdateScript
+, llvm
+}:
+
+callPackage ../base.nix rec {
+  inherit rocmUpdateScript;
+  buildMan = false; # No man pages to build
+  targetName = "lld";
+  targetDir = targetName;
+  extraBuildInputs = [ llvm ];
+  checkTargets = [ "check-${targetName}" ];
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-1/llvm.nix b/pkgs/development/rocm-modules/5/llvm/stage-1/llvm.nix
new file mode 100644
index 000000000000..51959ec8bc32
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-1/llvm.nix
@@ -0,0 +1,10 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+}:
+
+callPackage ../base.nix {
+  inherit rocmUpdateScript;
+  requiredSystemFeatures = [ "big-parallel" ];
+  isBroken = stdenv.isAarch64; # https://github.com/RadeonOpenCompute/ROCm/issues/1831#issuecomment-1278205344
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-1/runtimes.nix b/pkgs/development/rocm-modules/5/llvm/stage-1/runtimes.nix
new file mode 100644
index 000000000000..5f6f278ab10e
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-1/runtimes.nix
@@ -0,0 +1,30 @@
+{ lib
+, callPackage
+, rocmUpdateScript
+, llvm
+}:
+
+callPackage ../base.nix rec {
+  inherit rocmUpdateScript;
+  buildDocs = false;
+  buildMan = false;
+  buildTests = false;
+  targetName = "runtimes";
+  targetDir = targetName;
+
+  targetRuntimes = [
+    "libunwind"
+    "libcxxabi"
+    "libcxx"
+    "compiler-rt"
+  ];
+
+  extraBuildInputs = [ llvm ];
+
+  extraCMakeFlags = [
+    "-DLIBCXX_INCLUDE_BENCHMARKS=OFF"
+    "-DLIBCXX_CXX_ABI=libcxxabi"
+  ];
+
+  extraLicenses = [ lib.licenses.mit ];
+}
diff --git a/pkgs/development/compilers/llvm/rocm/1000-libcxx-failing-tests.list b/pkgs/development/rocm-modules/5/llvm/stage-2/1000-libcxx-failing-tests.list
similarity index 100%
rename from pkgs/development/compilers/llvm/rocm/1000-libcxx-failing-tests.list
rename to pkgs/development/rocm-modules/5/llvm/stage-2/1000-libcxx-failing-tests.list
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-2/bintools-unwrapped.nix b/pkgs/development/rocm-modules/5/llvm/stage-2/bintools-unwrapped.nix
new file mode 100644
index 000000000000..ef40dd4d3824
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-2/bintools-unwrapped.nix
@@ -0,0 +1,28 @@
+{ runCommand
+, llvm
+, lld
+}:
+
+runCommand "rocm-llvm-binutils-${llvm.version}" { preferLocalBuild = true; } ''
+  mkdir -p $out/bin
+
+  for prog in ${lld}/bin/*; do
+    ln -s $prog $out/bin/$(basename $prog)
+  done
+
+  for prog in ${llvm}/bin/*; do
+    ln -sf $prog $out/bin/$(basename $prog)
+  done
+
+  ln -s ${llvm}/bin/llvm-ar $out/bin/ar
+  ln -s ${llvm}/bin/llvm-as $out/bin/as
+  ln -s ${llvm}/bin/llvm-dwp $out/bin/dwp
+  ln -s ${llvm}/bin/llvm-nm $out/bin/nm
+  ln -s ${llvm}/bin/llvm-objcopy $out/bin/objcopy
+  ln -s ${llvm}/bin/llvm-objdump $out/bin/objdump
+  ln -s ${llvm}/bin/llvm-ranlib $out/bin/ranlib
+  ln -s ${llvm}/bin/llvm-readelf $out/bin/readelf
+  ln -s ${llvm}/bin/llvm-size $out/bin/size
+  ln -s ${llvm}/bin/llvm-strip $out/bin/strip
+  ln -s ${lld}/bin/lld $out/bin/ld
+''
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-2/compiler-rt.nix b/pkgs/development/rocm-modules/5/llvm/stage-2/compiler-rt.nix
new file mode 100644
index 000000000000..3b8e41705e1a
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-2/compiler-rt.nix
@@ -0,0 +1,63 @@
+{ lib
+, stdenv
+, callPackage
+, rocmUpdateScript
+, llvm
+, glibc
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildDocs = false; # No documentation to build
+  buildMan = false; # No man pages to build
+  targetName = "compiler-rt";
+  targetDir = "runtimes";
+
+  targetRuntimes = [
+    "libunwind"
+    "libcxxabi"
+    "libcxx"
+    targetName
+  ];
+
+  extraCMakeFlags = [
+    "-DCOMPILER_RT_INCLUDE_TESTS=ON"
+    "-DCOMPILER_RT_USE_LLVM_UNWINDER=ON"
+    "-DCOMPILER_RT_CXX_LIBRARY=libcxx"
+    "-DCOMPILER_RT_CAN_EXECUTE_TESTS=OFF" # We can't run most of these
+
+    # Workaround having to build combined
+    "-DLIBUNWIND_INCLUDE_DOCS=OFF"
+    "-DLIBUNWIND_INCLUDE_TESTS=OFF"
+    "-DLIBUNWIND_USE_COMPILER_RT=ON"
+    "-DLIBUNWIND_INSTALL_LIBRARY=OFF"
+    "-DLIBUNWIND_INSTALL_HEADERS=OFF"
+    "-DLIBCXXABI_INCLUDE_TESTS=OFF"
+    "-DLIBCXXABI_USE_LLVM_UNWINDER=ON"
+    "-DLIBCXXABI_USE_COMPILER_RT=ON"
+    "-DLIBCXXABI_INSTALL_LIBRARY=OFF"
+    "-DLIBCXXABI_INSTALL_HEADERS=OFF"
+    "-DLIBCXX_INCLUDE_DOCS=OFF"
+    "-DLIBCXX_INCLUDE_TESTS=OFF"
+    "-DLIBCXX_USE_COMPILER_RT=ON"
+    "-DLIBCXX_CXX_ABI=libcxxabi"
+    "-DLIBCXX_INSTALL_LIBRARY=OFF"
+    "-DLIBCXX_INSTALL_HEADERS=OFF"
+  ];
+
+  extraPostPatch = ''
+    # `No such file or directory: 'ldd'`
+    substituteInPlace ../compiler-rt/test/lit.common.cfg.py \
+      --replace "'ldd'," "'${glibc.bin}/bin/ldd',"
+
+    # We can run these
+    substituteInPlace ../compiler-rt/test/CMakeLists.txt \
+      --replace "endfunction()" "endfunction()''\nadd_subdirectory(builtins)''\nadd_subdirectory(shadowcallstack)"
+
+    # Could not launch llvm-config in /build/source/runtimes/build/bin
+    mkdir -p build/bin
+    ln -s ${llvm}/bin/llvm-config build/bin
+  '';
+
+  extraLicenses = [ lib.licenses.mit ];
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-2/libc.nix b/pkgs/development/rocm-modules/5/llvm/stage-2/libc.nix
new file mode 100644
index 000000000000..7e7cf9c2a608
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-2/libc.nix
@@ -0,0 +1,26 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildMan = false; # No man pages to build
+  targetName = "libc";
+  targetDir = "runtimes";
+  targetRuntimes = [ targetName ];
+
+  extraPostPatch = ''
+    # `Failed to match ... against ...` `Match value not within tolerance value of MPFR result:`
+    # We need a better way, but I don't know enough sed magic and patching `CMakeLists.txt` isn't working...
+    substituteInPlace ../libc/test/src/math/log10_test.cpp \
+      --replace "i < N" "i < 0" \
+      --replace "test(mpfr::RoundingMode::Nearest);" "" \
+      --replace "test(mpfr::RoundingMode::Downward);" "" \
+      --replace "test(mpfr::RoundingMode::Upward);" "" \
+      --replace "test(mpfr::RoundingMode::TowardZero);" ""
+  '';
+
+  checkTargets = [ "check-${targetName}" ];
+  hardeningDisable = [ "fortify" ]; # Prevent `error: "Assumed value of MB_LEN_MAX wrong"`
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-2/libcxx.nix b/pkgs/development/rocm-modules/5/llvm/stage-2/libcxx.nix
new file mode 100644
index 000000000000..473227242765
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-2/libcxx.nix
@@ -0,0 +1,42 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildMan = false; # No man pages to build
+  targetName = "libcxx";
+  targetDir = "runtimes";
+
+  targetRuntimes = [
+    "libunwind"
+    "libcxxabi"
+    targetName
+  ];
+
+  extraCMakeFlags = [
+    "-DLIBCXX_INCLUDE_DOCS=ON"
+    "-DLIBCXX_INCLUDE_TESTS=ON"
+    "-DLIBCXX_USE_COMPILER_RT=ON"
+    "-DLIBCXX_CXX_ABI=libcxxabi"
+
+    # Workaround having to build combined
+    "-DLIBUNWIND_INCLUDE_DOCS=OFF"
+    "-DLIBUNWIND_INCLUDE_TESTS=OFF"
+    "-DLIBUNWIND_USE_COMPILER_RT=ON"
+    "-DLIBUNWIND_INSTALL_LIBRARY=OFF"
+    "-DLIBUNWIND_INSTALL_HEADERS=OFF"
+    "-DLIBCXXABI_INCLUDE_TESTS=OFF"
+    "-DLIBCXXABI_USE_LLVM_UNWINDER=ON"
+    "-DLIBCXXABI_USE_COMPILER_RT=ON"
+    "-DLIBCXXABI_INSTALL_LIBRARY=OFF"
+    "-DLIBCXXABI_INSTALL_HEADERS=OFF"
+  ];
+
+  # Most of these can't find `bash` or `mkdir`, might just be hard-coded paths, or PATH is altered
+  extraPostPatch = ''
+    chmod +w -R ../libcxx/test/{libcxx,std}
+    cat ${./1000-libcxx-failing-tests.list} | xargs -d \\n rm
+  '';
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-2/libcxxabi.nix b/pkgs/development/rocm-modules/5/llvm/stage-2/libcxxabi.nix
new file mode 100644
index 000000000000..e15ec777ff61
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-2/libcxxabi.nix
@@ -0,0 +1,37 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildDocs = false; # No documentation to build
+  buildMan = false; # No man pages to build
+  targetName = "libcxxabi";
+  targetDir = "runtimes";
+
+  targetRuntimes = [
+    "libunwind"
+    targetName
+    "libcxx"
+  ];
+
+  extraCMakeFlags = [
+    "-DLIBCXXABI_INCLUDE_TESTS=ON"
+    "-DLIBCXXABI_USE_LLVM_UNWINDER=ON"
+    "-DLIBCXXABI_USE_COMPILER_RT=ON"
+
+    # Workaround having to build combined
+    "-DLIBUNWIND_INCLUDE_DOCS=OFF"
+    "-DLIBUNWIND_INCLUDE_TESTS=OFF"
+    "-DLIBUNWIND_USE_COMPILER_RT=ON"
+    "-DLIBUNWIND_INSTALL_LIBRARY=OFF"
+    "-DLIBUNWIND_INSTALL_HEADERS=OFF"
+    "-DLIBCXX_INCLUDE_DOCS=OFF"
+    "-DLIBCXX_INCLUDE_TESTS=OFF"
+    "-DLIBCXX_USE_COMPILER_RT=ON"
+    "-DLIBCXX_CXX_ABI=libcxxabi"
+    "-DLIBCXX_INSTALL_LIBRARY=OFF"
+    "-DLIBCXX_INSTALL_HEADERS=OFF"
+  ];
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-2/libunwind.nix b/pkgs/development/rocm-modules/5/llvm/stage-2/libunwind.nix
new file mode 100644
index 000000000000..3d599e0d4b32
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-2/libunwind.nix
@@ -0,0 +1,26 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildMan = false; # No man pages to build
+  targetName = "libunwind";
+  targetDir = "runtimes";
+  targetRuntimes = [ targetName ];
+
+  extraCMakeFlags = [
+    "-DLIBUNWIND_INCLUDE_DOCS=ON"
+    "-DLIBUNWIND_INCLUDE_TESTS=ON"
+    "-DLIBUNWIND_USE_COMPILER_RT=ON"
+  ];
+
+  extraPostPatch = ''
+    # `command had no output on stdout or stderr` (Says these unsupported tests)
+    chmod +w -R ../libunwind/test
+    rm ../libunwind/test/floatregister.pass.cpp
+    rm ../libunwind/test/unwind_leaffunction.pass.cpp
+    rm ../libunwind/test/libunwind_02.pass.cpp
+  '';
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-2/rstdenv.nix b/pkgs/development/rocm-modules/5/llvm/stage-2/rstdenv.nix
new file mode 100644
index 000000000000..45d369a6541c
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-2/rstdenv.nix
@@ -0,0 +1,35 @@
+{ stdenv
+, overrideCC
+, wrapCCWith
+, llvm
+, clang-unwrapped
+, lld
+, runtimes
+, bintools
+}:
+
+overrideCC stdenv (wrapCCWith rec {
+  inherit bintools;
+  libcxx = runtimes;
+  cc = clang-unwrapped;
+
+  extraPackages = [
+    llvm
+    lld
+  ];
+
+  nixSupport.cc-cflags = [
+    "-resource-dir=$out/resource-root"
+    "-fuse-ld=lld"
+    "-rtlib=compiler-rt"
+    "-unwindlib=libunwind"
+    "-Wno-unused-command-line-argument"
+  ];
+
+  extraBuildCommands = ''
+    clang_version=`${cc}/bin/clang -v 2>&1 | grep "clang version " | grep -E -o "[0-9.-]+"`
+    mkdir -p $out/resource-root
+    ln -s ${cc}/lib/clang/$clang_version/include $out/resource-root
+    ln -s ${runtimes}/lib $out/resource-root
+  '';
+})
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/clang-tools-extra.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/clang-tools-extra.nix
new file mode 100644
index 000000000000..d18673ecb3db
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/clang-tools-extra.nix
@@ -0,0 +1,42 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+, llvm
+, clang-unwrapped
+, gtest
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildTests = false; # `invalid operands to binary expression ('std::basic_stringstream<char>' and 'const llvm::StringRef')`
+  targetName = "clang-tools-extra";
+
+  targetProjects = [
+    "clang"
+    "clang-tools-extra"
+  ];
+
+  extraBuildInputs = [ gtest ];
+
+  extraCMakeFlags = [
+    "-DLLVM_INCLUDE_DOCS=OFF"
+    "-DLLVM_INCLUDE_TESTS=OFF"
+    "-DCLANG_INCLUDE_DOCS=OFF"
+    "-DCLANG_INCLUDE_TESTS=ON"
+    "-DCLANG_TOOLS_EXTRA_INCLUDE_DOCS=ON"
+  ];
+
+  extraPostInstall = ''
+    # Remove LLVM and Clang
+    for path in `find ${llvm} ${clang-unwrapped}`; do
+      if [ $path != ${llvm} ] && [ $path != ${clang-unwrapped} ]; then
+        rm -f $out''${path#${llvm}} $out''${path#${clang-unwrapped}} || true
+      fi
+    done
+
+    # Cleanup empty directories
+    find $out -type d -empty -delete
+  '';
+
+  requiredSystemFeatures = [ "big-parallel" ];
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/clang.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/clang.nix
new file mode 100644
index 000000000000..91f34265f85f
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/clang.nix
@@ -0,0 +1,73 @@
+{ stdenv
+, wrapCCWith
+, llvm
+, lld
+, clang-unwrapped
+, bintools
+, libc
+, libunwind
+, libcxxabi
+, libcxx
+, compiler-rt
+}:
+
+wrapCCWith rec {
+  inherit libcxx bintools;
+
+  # We do this to avoid HIP pathing problems, and mimic a monolithic install
+  cc = stdenv.mkDerivation (finalAttrs: {
+    inherit (clang-unwrapped) version;
+    pname = "rocm-llvm-clang";
+    dontUnpack = true;
+
+    installPhase = ''
+      runHook preInstall
+
+      clang_version=`${clang-unwrapped}/bin/clang -v 2>&1 | grep "clang version " | grep -E -o "[0-9.-]+"`
+      mkdir -p $out/{bin,include/c++/v1,lib/{cmake,clang/$clang_version/{include,lib}},libexec,share}
+
+      for path in ${llvm} ${clang-unwrapped} ${lld} ${libc} ${libunwind} ${libcxxabi} ${libcxx} ${compiler-rt}; do
+        cp -as $path/* $out
+        chmod +w $out/{*,include/c++/v1,lib/{clang/$clang_version/include,cmake}}
+        rm -f $out/lib/libc++.so
+      done
+
+      ln -s $out/lib/* $out/lib/clang/$clang_version/lib
+      ln -sf $out/include/* $out/lib/clang/$clang_version/include
+
+      runHook postInstall
+    '';
+
+    passthru.isClang = true;
+  });
+
+  extraPackages = [
+    llvm
+    lld
+    libc
+    libunwind
+    libcxxabi
+    compiler-rt
+  ];
+
+  nixSupport.cc-cflags = [
+    "-resource-dir=$out/resource-root"
+    "-fuse-ld=lld"
+    "-rtlib=compiler-rt"
+    "-unwindlib=libunwind"
+    "-Wno-unused-command-line-argument"
+  ];
+
+  extraBuildCommands = ''
+    clang_version=`${cc}/bin/clang -v 2>&1 | grep "clang version " | grep -E -o "[0-9.-]+"`
+    mkdir -p $out/resource-root
+    ln -s ${cc}/lib/clang/$clang_version/{include,lib} $out/resource-root
+
+    # Not sure why, but hardening seems to make things break
+    echo "" > $out/nix-support/add-hardening.sh
+
+    # GPU compilation uses builtin `lld`
+    substituteInPlace $out/bin/{clang,clang++} \
+      --replace "-MM) dontLink=1 ;;" "-MM | --cuda-device-only) dontLink=1 ;;''\n--cuda-host-only | --cuda-compile-host-device) dontLink=0 ;;"
+  '';
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix
new file mode 100644
index 000000000000..7289602451db
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix
@@ -0,0 +1,33 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+, clang-unwrapped
+, mlir
+, python3Packages
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildTests = false; # `Executable "flang1" doesn't exist!`
+  targetName = "flang";
+  targetDir = targetName;
+  extraNativeBuildInputs = [ python3Packages.sphinx-markdown-tables ];
+  extraBuildInputs = [ mlir ];
+
+  extraCMakeFlags = [
+    "-DCMAKE_POLICY_DEFAULT_CMP0116=NEW"
+    "-DCLANG_DIR=${clang-unwrapped}/lib/cmake/clang"
+    "-DFLANG_INCLUDE_TESTS=OFF"
+    "-DMLIR_TABLEGEN_EXE=${mlir}/bin/mlir-tblgen"
+  ];
+
+  extraPostPatch = ''
+    substituteInPlace test/CMakeLists.txt \
+      --replace "FileCheck" "" \
+      --replace "count" "" \
+      --replace "not" ""
+
+    substituteInPlace docs/CMakeLists.txt \
+      --replace "CLANG_TABLEGEN_EXE clang-tblgen" "CLANG_TABLEGEN_EXE ${clang-unwrapped}/bin/clang-tblgen"
+  '';
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/libclc.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/libclc.nix
new file mode 100644
index 000000000000..1fd72ee67188
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/libclc.nix
@@ -0,0 +1,36 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+, llvm
+, clang
+, spirv-llvm-translator
+}:
+
+let
+  spirv = (spirv-llvm-translator.override { inherit llvm; });
+in callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildDocs = false; # No documentation to build
+  buildMan = false; # No man pages to build
+  targetName = "libclc";
+  targetDir = targetName;
+  extraBuildInputs = [ spirv ];
+
+  # `spirv-mesa3d` isn't compiling with LLVM 15.0.0, it does with LLVM 14.0.0
+  # Try removing the `spirv-mesa3d` and `clspv` patches next update
+  # `clspv` tests fail, unresolved calls
+  extraPostPatch = ''
+    substituteInPlace CMakeLists.txt \
+      --replace "find_program( LLVM_CLANG clang PATHS \''${LLVM_BINDIR} NO_DEFAULT_PATH )" \
+        "find_program( LLVM_CLANG clang PATHS \"${clang}/bin\" NO_DEFAULT_PATH )" \
+      --replace "find_program( LLVM_SPIRV llvm-spirv PATHS \''${LLVM_BINDIR} NO_DEFAULT_PATH )" \
+        "find_program( LLVM_SPIRV llvm-spirv PATHS \"${spirv}/bin\" NO_DEFAULT_PATH )" \
+      --replace "  spirv-mesa3d-" "" \
+      --replace "  spirv64-mesa3d-" "" \
+      --replace "NOT \''${t} MATCHES" \
+        "NOT \''${ARCH} STREQUAL \"clspv\" AND NOT \''${ARCH} STREQUAL \"clspv64\" AND NOT \''${t} MATCHES"
+  '';
+
+  checkTargets = [ ];
+  isBroken = true; # ROCm 5.7.0 doesn't have IR/AttributeMask.h yet...?
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/lldb.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/lldb.nix
new file mode 100644
index 000000000000..9b7d25e06d9d
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/lldb.nix
@@ -0,0 +1,39 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+, clang
+, xz
+, swig
+, lua5_3
+, graphviz
+, gtest
+, python3Packages
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildTests = false; # FIXME: Bad pathing for clang executable in tests, using relative path most likely
+  targetName = "lldb";
+  targetDir = targetName;
+  extraNativeBuildInputs = [ python3Packages.sphinx-automodapi ];
+
+  extraBuildInputs = [
+    xz
+    swig
+    lua5_3
+    graphviz
+    gtest
+  ];
+
+  extraCMakeFlags = [
+    "-DLLDB_EXTERNAL_CLANG_RESOURCE_DIR=${clang}/resource-root/lib/clang/$clang_version"
+    "-DLLDB_INCLUDE_TESTS=ON"
+    "-DLLDB_INCLUDE_UNITTESTS=ON"
+  ];
+
+  extraPostPatch = ''
+    export clang_version=`clang -v 2>&1 | grep "clang version " | grep -E -o "[0-9.-]+"`
+  '';
+
+  checkTargets = [ "check-${targetName}" ];
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix
new file mode 100644
index 000000000000..099622ca7cb8
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix
@@ -0,0 +1,67 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+# , hip
+# , rocm-comgr
+, vulkan-headers
+, vulkan-loader
+, glslang
+, shaderc
+, lit
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildDocs = false; # No decent way to hack this to work
+  buildMan = false; # No man pages to build
+  targetName = "mlir";
+  targetDir = targetName;
+  # extraNativeBuildInputs = [ hip ];
+
+  extraBuildInputs = [
+    # rocm-comgr
+    vulkan-headers
+    vulkan-loader
+    glslang
+    shaderc
+  ];
+
+  extraCMakeFlags = [
+    "-DCMAKE_POLICY_DEFAULT_CMP0116=NEW"
+    "-DMLIR_INCLUDE_DOCS=ON"
+    "-DMLIR_INCLUDE_TESTS=ON"
+    "-DMLIR_ENABLE_ROCM_RUNNER=ON"
+    "-DMLIR_ENABLE_SPIRV_CPU_RUNNER=ON"
+    "-DMLIR_ENABLE_VULKAN_RUNNER=ON"
+    "-DROCM_TEST_CHIPSET=gfx000" # CPU runner
+  ];
+
+  extraPostPatch = ''
+    chmod +w ../llvm
+    mkdir -p ../llvm/build/bin
+    ln -s ${lit}/bin/lit ../llvm/build/bin/llvm-lit
+
+    substituteInPlace test/CMakeLists.txt \
+      --replace "FileCheck count not" "" \
+      --replace "list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)" ""
+
+    substituteInPlace lib/ExecutionEngine/CMakeLists.txt \
+      --replace "return()" ""
+
+    # Remove problematic tests
+    rm test/CAPI/execution_engine.c
+    rm test/Target/LLVMIR/llvmir-intrinsics.mlir
+    rm test/Target/LLVMIR/llvmir.mlir
+    rm test/Target/LLVMIR/openmp-llvm.mlir
+    rm test/mlir-cpu-runner/*.mlir
+    rm test/mlir-vulkan-runner/*.mlir
+  '';
+
+  extraPostInstall = ''
+    mkdir -p $out/bin
+    mv bin/mlir-tblgen $out/bin
+  '';
+
+  checkTargets = [ "check-${targetName}" ];
+  requiredSystemFeatures = [ "big-parallel" ];
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/openmp.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/openmp.nix
new file mode 100644
index 000000000000..faab6388835e
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/openmp.nix
@@ -0,0 +1,45 @@
+{ lib
+, stdenv
+, callPackage
+, rocmUpdateScript
+, llvm
+, clang
+, clang-unwrapped
+# , rocm-device-libs
+# , rocm-runtime
+, perl
+, elfutils
+, lit
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildTests = false; # Too many failures, most pass
+  targetName = "openmp";
+  targetDir = targetName;
+  extraNativeBuildInputs = [ perl ];
+
+  extraBuildInputs = [
+    # rocm-device-libs
+    # rocm-runtime
+    elfutils
+  ];
+
+  extraCMakeFlags = [
+    "-DCMAKE_MODULE_PATH=/build/source/llvm/cmake/modules" # For docs
+    "-DCLANG_TOOL=${clang}/bin/clang"
+    "-DCLANG_OFFLOAD_BUNDLER_TOOL=${clang-unwrapped}/bin/clang-offload-bundler"
+    "-DOPENMP_LLVM_TOOLS_DIR=${llvm}/bin"
+    "-DOPENMP_LLVM_LIT_EXECUTABLE=${lit}/bin/.lit-wrapped"
+    # "-DDEVICELIBS_ROOT=${rocm-device-libs.src}"
+  ];
+
+  extraPostPatch = ''
+    # We can't build this target at the moment
+    substituteInPlace libomptarget/DeviceRTL/CMakeLists.txt \
+      --replace "gfx1010" ""
+  '';
+
+  checkTargets = [ "check-${targetName}" ];
+  extraLicenses = [ lib.licenses.mit ];
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/polly.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/polly.nix
new file mode 100644
index 000000000000..e001f33dfd43
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/polly.nix
@@ -0,0 +1,18 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  targetName = "polly";
+  targetDir = targetName;
+
+  extraPostPatch = ''
+    # `add_library cannot create target "llvm_gtest" because an imported target with the same name already exists`
+    substituteInPlace CMakeLists.txt \
+      --replace "NOT TARGET gtest" "FALSE"
+  '';
+
+  checkTargets = [ "check-${targetName}" ];
+}
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/pstl.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/pstl.nix
new file mode 100644
index 000000000000..dc7d7cd6ccbf
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/pstl.nix
@@ -0,0 +1,15 @@
+{ stdenv
+, callPackage
+, rocmUpdateScript
+}:
+
+callPackage ../base.nix rec {
+  inherit stdenv rocmUpdateScript;
+  buildDocs = false; # No documentation to build
+  buildMan = false; # No man pages to build
+  buildTests = false; # Too many errors
+  targetName = "pstl";
+  targetDir = "runtimes";
+  targetRuntimes = [ targetName ];
+  checkTargets = [ "check-${targetName}" ];
+}
diff --git a/pkgs/development/rocm-modules/update-script/default.nix b/pkgs/development/rocm-modules/5/update.nix
similarity index 91%
rename from pkgs/development/rocm-modules/update-script/default.nix
rename to pkgs/development/rocm-modules/5/update.nix
index 6188587de31a..abd434776ef9 100644
--- a/pkgs/development/rocm-modules/update-script/default.nix
+++ b/pkgs/development/rocm-modules/5/update.nix
@@ -12,7 +12,7 @@
 let
   pname =
     if lib.hasPrefix "rocm-llvm-" name
-    then "llvmPackages_rocm.${lib.removePrefix "rocm-llvm-" name}"
+    then "rocmPackages_5.llvm.${lib.removePrefix "rocm-llvm-" name}"
     else name;
 
   updateScript = writeScript "update.sh" ''
diff --git a/pkgs/test/default.nix b/pkgs/test/default.nix
index 05d8ee61e9a5..2b1768515bab 100644
--- a/pkgs/test/default.nix
+++ b/pkgs/test/default.nix
@@ -8,7 +8,7 @@ with pkgs;
     llvmTests = let
       pkgSets = lib.pipe pkgNames [
         (filter (lib.hasPrefix "llvmPackages"))
-        (filter (n: n != "llvmPackages_rocm"))
+        (filter (n: n != "rocmPackages.llvm"))
         (filter (n: n != "llvmPackages_latest"))
         (filter (n: n != "llvmPackages_git"))
       ];
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 123931d8533c..4e03557c7740 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -7766,6 +7766,9 @@ with pkgs;
 
   rar2fs = callPackage ../tools/filesystems/rar2fs { };
 
+  rocmPackages = rocmPackages_5;
+  rocmPackages_5 = recurseIntoAttrs (callPackage ../development/rocm-modules/5 { });
+
   rune = callPackage ../development/interpreters/rune { };
 
   s9fes = callPackage ../development/interpreters/s9fes { };
@@ -15716,7 +15719,6 @@ with pkgs;
   clangStdenv = if stdenv.cc.isClang then stdenv else lowPrio llvmPackages.stdenv;
   clang-sierraHack-stdenv = overrideCC stdenv buildPackages.clang-sierraHack;
   libcxxStdenv = if stdenv.isDarwin then stdenv else lowPrio llvmPackages.libcxxStdenv;
-  rocmClangStdenv = llvmPackages_rocm.rocmClangStdenv;
 
   clean = callPackage ../development/compilers/clean { };
 
@@ -16771,8 +16773,6 @@ with pkgs;
     targetLlvm = targetPackages.llvmPackages_16.llvm or llvmPackages_16.llvm;
   }));
 
-  llvmPackages_rocm = recurseIntoAttrs (callPackage ../development/compilers/llvm/rocm { });
-
   lorri = callPackage ../tools/misc/lorri {
     inherit (darwin.apple_sdk.frameworks) CoreServices Security;
   };
@@ -16958,7 +16958,7 @@ with pkgs;
   rml = callPackage ../development/compilers/rml { };
 
   composable_kernel = callPackage ../development/libraries/composable_kernel {
-    inherit (llvmPackages_rocm) openmp clang-tools-extra;
+    inherit (rocmPackages.llvm) openmp clang-tools-extra;
     stdenv = rocmClangStdenv;
   };
 
@@ -16977,17 +16977,17 @@ with pkgs;
   };
 
   hip-common = callPackage ../development/compilers/hip-common {
-    inherit (llvmPackages_rocm) llvm;
+    inherit (rocmPackages.llvm) llvm;
     stdenv = rocmClangStdenv;
   };
 
   hipcc = callPackage ../development/compilers/hipcc {
-    inherit (llvmPackages_rocm) llvm;
+    inherit (rocmPackages.llvm) llvm;
     stdenv = rocmClangStdenv;
   };
 
   hip = callPackage ../development/compilers/hip {
-    inherit (llvmPackages_rocm) llvm;
+    inherit (rocmPackages.llvm) llvm;
     inherit (cudaPackages) cudatoolkit;
     stdenv = rocmClangStdenv;
   };
@@ -17009,7 +17009,7 @@ with pkgs;
   };
 
   hipsparse = callPackage ../development/libraries/hipsparse {
-    inherit (llvmPackages_rocm) openmp;
+    inherit (rocmPackages.llvm) openmp;
     stdenv = rocmClangStdenv;
   };
 
@@ -17018,7 +17018,7 @@ with pkgs;
   };
 
   hipfft = callPackage ../development/libraries/hipfft {
-    inherit (llvmPackages_rocm) openmp;
+    inherit (rocmPackages.llvm) openmp;
     stdenv = rocmClangStdenv;
   };
 
@@ -17031,7 +17031,7 @@ with pkgs;
   };
 
   migraphx = callPackage ../development/libraries/migraphx {
-    inherit (llvmPackages_rocm) clang-tools-extra openmp;
+    inherit (rocmPackages.llvm) clang-tools-extra openmp;
     stdenv = rocmClangStdenv;
     rocmlir = rocmlir-rock;
   };
@@ -17049,7 +17049,7 @@ with pkgs;
   };
 
   rocalution = callPackage ../development/libraries/rocalution {
-    inherit (llvmPackages_rocm) openmp;
+    inherit (rocmPackages.llvm) openmp;
     stdenv = rocmClangStdenv;
   };
 
@@ -17122,7 +17122,7 @@ with pkgs;
   };
 
   rocfft = callPackage ../development/libraries/rocfft {
-    inherit (llvmPackages_rocm) openmp;
+    inherit (rocmPackages.llvm) openmp;
     stdenv = rocmClangStdenv;
   };
 
@@ -17135,12 +17135,12 @@ with pkgs;
   };
 
   rocwmma = callPackage ../development/libraries/rocwmma {
-    inherit (llvmPackages_rocm) openmp;
+    inherit (rocmPackages.llvm) openmp;
     stdenv = rocmClangStdenv;
   };
 
   rocblas = callPackage ../development/libraries/rocblas {
-    inherit (llvmPackages_rocm) openmp;
+    inherit (rocmPackages.llvm) openmp;
     stdenv = rocmClangStdenv;
   };
 
@@ -17153,7 +17153,7 @@ with pkgs;
   };
 
   miopen = callPackage ../development/libraries/miopen {
-    inherit (llvmPackages_rocm) llvm clang-tools-extra;
+    inherit (rocmPackages.llvm) llvm clang-tools-extra;
     stdenv = rocmClangStdenv;
     rocmlir = rocmlir-rock;
     boost = boost179.override { enableStatic = true; };
@@ -17167,11 +17167,9 @@ with pkgs;
     useOpenCL = true;
   };
 
-  rocmUpdateScript = callPackage ../development/rocm-modules/update-script { };
-
   # Requires GCC
   roctracer = callPackage ../development/libraries/roctracer {
-    inherit (llvmPackages_rocm) clang;
+    inherit (rocmPackages.llvm) clang;
   };
 
   rtags = callPackage ../development/tools/rtags {
@@ -39477,7 +39475,7 @@ with pkgs;
   lie = callPackage ../applications/science/math/LiE { };
 
   inherit (callPackage ../development/libraries/science/math/magma {
-    inherit (llvmPackages_rocm) openmp;
+    inherit (rocmPackages.llvm) openmp;
   }) magma magma_2_7_2 magma_2_6_2;
 
   magma-cuda = magma.override {
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index d0e53a2a38c0..4521548cb3d8 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -8349,7 +8349,7 @@ self: super: with self; {
 
   open-meteo = callPackage ../development/python-modules/open-meteo { };
 
-  openai-triton = callPackage ../development/python-modules/openai-triton { llvmPackages = pkgs.llvmPackages_rocm; };
+  openai-triton = callPackage ../development/python-modules/openai-triton { llvmPackages = pkgs.rocmPackages.llvm; };
 
   openai-triton-bin = callPackage ../development/python-modules/openai-triton/bin.nix { };
 
@@ -13902,7 +13902,7 @@ self: super: with self; {
       else pkgs.magma;
     inherit (pkgs.darwin.apple_sdk.frameworks) Accelerate CoreServices;
     inherit (pkgs.darwin) libobjc;
-    inherit (pkgs.llvmPackages_rocm) openmp;
+    inherit (pkgs.rocmPackages.llvm) openmp;
   };
 
   torch-bin = callPackage ../development/python-modules/torch/bin.nix {

From 575ce47fa4051fbb5c234a10eed923d11257749a Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sun, 1 Oct 2023 20:05:32 -0500
Subject: [PATCH 12/30] rocm-related: move all relevant ROCm derivations to
 rocmPackages

clr: init at 5.7.0 (hipamd, opencl, rocclr merged)
---
 .../hip-common/0000-fixup-paths.patch         | 129 -------
 .../compilers/hip/0000-fixup-paths.patch      |  62 ---
 pkgs/development/compilers/hip/default.nix    | 197 ----------
 .../compilers/hipcc/0000-fixup-paths.patch    | 130 -------
 pkgs/development/libraries/rocclr/default.nix |  64 ---
 .../libraries/rocm-comgr/cmake.patch          | 365 ------------------
 .../libraries/rocm-opencl-icd/default.nix     |  26 --
 .../libraries/rocm-opencl-icd/test.nix        |  19 -
 .../libraries/rocm-opencl-runtime/default.nix |  69 ----
 ...0000-dont-require-hsa_amd_aqlprofile.patch |  20 -
 pkgs/development/libraries/ucx/default.nix    |   9 +-
 .../5}/clang-ocl/default.nix                  |   0
 .../rocm-modules/5/clr/default.nix            | 147 +++++++
 .../5}/composable_kernel/default.nix          |   0
 pkgs/development/rocm-modules/5/default.nix   | 237 +++++++++++-
 .../5}/hip-common/default.nix                 |  15 -
 .../5}/hipblas/default.nix                    |   2 +-
 .../5}/hipcc/default.nix                      |  30 +-
 .../5}/hipcub/default.nix                     |   2 +-
 .../5}/hipfft/default.nix                     |   2 +-
 .../5}/hipfort/default.nix                    |   0
 .../5}/hipify/default.nix                     |  11 +-
 .../5}/hipsolver/default.nix                  |   2 +-
 .../5}/hipsparse/default.nix                  |   2 +-
 .../5}/migraphx/default.nix                   |   2 +-
 .../5}/miopen/default.nix                     |   2 +-
 .../5}/miopen/deps.nix                        |   0
 .../5}/miopengemm/default.nix                 |   0
 .../5}/rccl/default.nix                       |   2 +-
 .../misc => rocm-modules/5}/rdc/default.nix   |   3 +-
 .../5}/rocalution/default.nix                 |   2 +-
 .../5}/rocblas/default.nix                    |   2 +-
 .../5}/rocdbgapi/default.nix                  |  11 +
 .../5}/rocfft/default.nix                     |   2 +-
 .../5}/rocfft/device-install.patch            |   0
 .../5}/rocfft/split-kernel-compilation.patch  |   0
 .../5}/rocgdb/default.nix                     |   0
 .../5}/rocm-cmake/default.nix                 |   2 +
 .../5}/rocm-comgr/default.nix                 |   1 -
 .../5}/rocm-core/default.nix                  |   0
 .../5}/rocm-device-libs/cmake.patch           |   0
 .../5}/rocm-device-libs/default.nix           |   0
 .../5}/rocm-runtime/default.nix               |   4 +-
 .../rocm-modules/5}/rocm-smi/cmake.patch      |   0
 .../rocm-modules/5}/rocm-smi/default.nix      |  20 +-
 .../5}/rocm-thunk/default.nix                 |   6 -
 .../5}/rocminfo/default.nix                   |   0
 .../5}/rocmlir/default.nix                    |   0
 .../5}/rocprim/default.nix                    |   2 +-
 .../5}/rocprofiler/default.nix                |  62 ++-
 .../5}/rocr-debug-agent/default.nix           |  14 +-
 .../5}/rocrand/default.nix                    |   2 +-
 .../5}/rocsolver/default.nix                  |   2 +-
 .../5}/rocsparse/default.nix                  |   2 +-
 .../5}/rocsparse/deps.nix                     |   0
 .../5}/rocthrust/default.nix                  |   2 +-
 .../5}/roctracer/default.nix                  |  11 +-
 .../rocwmma/0000-dont-fetch-googletest.patch  |   0
 .../5}/rocwmma/default.nix                    |   2 +-
 .../5}/tensile/default.nix                    |   0
 pkgs/development/rocm-modules/5/update.nix    |   4 +-
 pkgs/top-level/all-packages.nix               | 215 +----------
 62 files changed, 518 insertions(+), 1399 deletions(-)
 delete mode 100644 pkgs/development/compilers/hip-common/0000-fixup-paths.patch
 delete mode 100644 pkgs/development/compilers/hip/0000-fixup-paths.patch
 delete mode 100644 pkgs/development/compilers/hip/default.nix
 delete mode 100644 pkgs/development/compilers/hipcc/0000-fixup-paths.patch
 delete mode 100644 pkgs/development/libraries/rocclr/default.nix
 delete mode 100644 pkgs/development/libraries/rocm-comgr/cmake.patch
 delete mode 100644 pkgs/development/libraries/rocm-opencl-icd/default.nix
 delete mode 100644 pkgs/development/libraries/rocm-opencl-icd/test.nix
 delete mode 100644 pkgs/development/libraries/rocm-opencl-runtime/default.nix
 delete mode 100644 pkgs/development/libraries/rocprofiler/0000-dont-require-hsa_amd_aqlprofile.patch
 rename pkgs/development/{libraries => rocm-modules/5}/clang-ocl/default.nix (100%)
 create mode 100644 pkgs/development/rocm-modules/5/clr/default.nix
 rename pkgs/development/{libraries => rocm-modules/5}/composable_kernel/default.nix (100%)
 rename pkgs/development/{compilers => rocm-modules/5}/hip-common/default.nix (79%)
 rename pkgs/development/{libraries => rocm-modules/5}/hipblas/default.nix (97%)
 rename pkgs/development/{compilers => rocm-modules/5}/hipcc/default.nix (62%)
 rename pkgs/development/{libraries => rocm-modules/5}/hipcub/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/hipfft/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/hipfort/default.nix (100%)
 rename pkgs/development/{compilers => rocm-modules/5}/hipify/default.nix (83%)
 rename pkgs/development/{libraries => rocm-modules/5}/hipsolver/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/hipsparse/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/migraphx/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/miopen/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/miopen/deps.nix (100%)
 rename pkgs/development/{libraries => rocm-modules/5}/miopengemm/default.nix (100%)
 rename pkgs/development/{libraries => rocm-modules/5}/rccl/default.nix (98%)
 rename pkgs/development/{tools/misc => rocm-modules/5}/rdc/default.nix (94%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocalution/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocblas/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocdbgapi/default.nix (87%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocfft/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocfft/device-install.patch (100%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocfft/split-kernel-compilation.patch (100%)
 rename pkgs/development/{tools/misc => rocm-modules/5}/rocgdb/default.nix (100%)
 rename pkgs/development/{tools/build-managers => rocm-modules/5}/rocm-cmake/default.nix (91%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocm-comgr/default.nix (97%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocm-core/default.nix (100%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocm-device-libs/cmake.patch (100%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocm-device-libs/default.nix (100%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocm-runtime/default.nix (87%)
 rename pkgs/{tools/system => development/rocm-modules/5}/rocm-smi/cmake.patch (100%)
 rename pkgs/{tools/system => development/rocm-modules/5}/rocm-smi/default.nix (72%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocm-thunk/default.nix (92%)
 rename pkgs/development/{tools => rocm-modules/5}/rocminfo/default.nix (100%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocmlir/default.nix (100%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocprim/default.nix (98%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocprofiler/default.nix (52%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocr-debug-agent/default.nix (87%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocrand/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocsolver/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocsparse/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocsparse/deps.nix (100%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocthrust/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/roctracer/default.nix (93%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocwmma/0000-dont-fetch-googletest.patch (100%)
 rename pkgs/development/{libraries => rocm-modules/5}/rocwmma/default.nix (99%)
 rename pkgs/development/{libraries => rocm-modules/5}/tensile/default.nix (100%)

diff --git a/pkgs/development/compilers/hip-common/0000-fixup-paths.patch b/pkgs/development/compilers/hip-common/0000-fixup-paths.patch
deleted file mode 100644
index f3fd73255520..000000000000
--- a/pkgs/development/compilers/hip-common/0000-fixup-paths.patch
+++ /dev/null
@@ -1,129 +0,0 @@
-diff --git a/bin/hipcc.pl b/bin/hipcc.pl
-index da9559b..7aaa540 100755
---- a/bin/hipcc.pl
-+++ b/bin/hipcc.pl
-@@ -185,7 +185,7 @@ if ($HIP_PLATFORM eq "amd") {
-     chomp($HIP_CLANG_TARGET);
- 
-     if (! defined $HIP_CLANG_INCLUDE_PATH) {
--        $HIP_CLANG_INCLUDE_PATH = abs_path("$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/include");
-+        $HIP_CLANG_INCLUDE_PATH = abs_path("@clang@/resource-root/include");
-     }
-     if (! defined $HIP_INCLUDE_PATH) {
-         $HIP_INCLUDE_PATH = "$HIP_PATH/include";
-@@ -206,8 +206,8 @@ if ($HIP_PLATFORM eq "amd") {
-         print ("HIP_CLANG_TARGET=$HIP_CLANG_TARGET\n");
-     }
- 
--    $HIPCXXFLAGS .= " -isystem \"$HIP_CLANG_INCLUDE_PATH/..\"";
--    $HIPCFLAGS .= " -isystem \"$HIP_CLANG_INCLUDE_PATH/..\"";
-+    $HIPCXXFLAGS .= " -isystem \"$HIP_CLANG_INCLUDE_PATH\"";
-+    $HIPCFLAGS .= " -isystem \"$HIP_CLANG_INCLUDE_PATH\"";
-     $HIPLDFLAGS .= " -L\"$HIP_LIB_PATH\"";
-     if ($isWindows) {
-       $HIPLDFLAGS .= " -lamdhip64";
-@@ -625,7 +625,7 @@ if($HIP_PLATFORM eq "amd"){
-             $targetsStr = $ENV{HCC_AMDGPU_TARGET};
-         } elsif (not $isWindows) {
-             # Else try using rocm_agent_enumerator
--            $ROCM_AGENT_ENUM = "${ROCM_PATH}/bin/rocm_agent_enumerator";
-+            $ROCM_AGENT_ENUM = "@rocminfo@/bin/rocm_agent_enumerator";
-             $targetsStr = `${ROCM_AGENT_ENUM} -t GPU`;
-             $targetsStr =~ s/\n/,/g;
-         }
-@@ -724,16 +724,16 @@ if ($HIP_PLATFORM eq "amd") {
- 
-     if (not $isWindows  and not $compileOnly) {
-       if ($linkType eq 0) {
--        $toolArgs = " -L$HIP_LIB_PATH -lamdhip64 -L$ROCM_PATH/lib -lhsa-runtime64 -ldl -lnuma " . ${toolArgs};
-+        $toolArgs = " -L$HIP_LIB_PATH -lamdhip64 -L@rocm_runtime@/lib -lhsa-runtime64 -ldl -lnuma " . ${toolArgs};
-       } else {
-         $toolArgs = ${toolArgs} . " -Wl,-rpath=$HIP_LIB_PATH:$ROCM_PATH/lib -lamdhip64 ";
-       }
-       # To support __fp16 and _Float16, explicitly link with compiler-rt
--      $HIP_CLANG_BUILTIN_LIB="$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/lib/$HIP_CLANG_TARGET/libclang_rt.builtins.a";
-+      $HIP_CLANG_BUILTIN_LIB="@clang@/resource-root/lib/$HIP_CLANG_TARGET/libclang_rt.builtins.a";
-       if (-e $HIP_CLANG_BUILTIN_LIB) {
--        $toolArgs .= " -L$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/lib/$HIP_CLANG_TARGET -lclang_rt.builtins "
-+        $toolArgs .= " -L@clang@/resource-root/lib/$HIP_CLANG_TARGET -lclang_rt.builtins "
-       } else {
--        $toolArgs .= " -L$HIP_CLANG_PATH/../lib/clang/$HIP_CLANG_VERSION/lib/linux -lclang_rt.builtins-x86_64 "
-+        $toolArgs .= " -L@clang@/resource-root/lib/linux -lclang_rt.builtins-x86_64 "
-       }
-     }
- }
-diff --git a/bin/hipconfig.pl b/bin/hipconfig.pl
-index 5ddb8e9..6a76a2e 100755
---- a/bin/hipconfig.pl
-+++ b/bin/hipconfig.pl
-@@ -77,7 +77,7 @@ if ($HIP_COMPILER eq "clang") {
-     $CPP_CONFIG = " -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__=";
- 
-     $HIP_PATH_INCLUDE = $HIP_PATH."/include";
--    $HIP_CLANG_INCLUDE = $HIP_CLANG_PATH."/../lib/clang/".$HIP_CLANG_VERSION;
-+    $HIP_CLANG_INCLUDE = "@clang@/resource-root/include";
-     if($isWindows) {
-         $CPP_CONFIG .= " -I\"$HIP_PATH_INCLUDE\" -I\"$HIP_CLANG_INCLUDE\"";
-     } else {
-@@ -168,7 +168,7 @@ if (!$printed or $p_full) {
-             print ("HIP_CLANG_PATH   : $HIP_CLANG_PATH\n");
-             if ($isWindows) {
-                 system("\"$HIP_CLANG_PATH/clang++\" --version");
--                system("\"$HIP_CLANG_PATH/llc\" --version");
-+                system("\"@llvm@/bin/llc\" --version");
-                 printf("hip-clang-cxxflags : ");
-                 $win_output = `perl \"$HIP_PATH/bin/hipcc\" --cxxflags`;
-                 printf("$win_output \n");
-@@ -177,7 +177,7 @@ if (!$printed or $p_full) {
-                 printf("$win_output \n");
-             } else {
-                 system("$HIP_CLANG_PATH/clang++ --version");
--                system("$HIP_CLANG_PATH/llc --version");
-+                system("@llvm@/bin/llc --version");
-                 print ("hip-clang-cxxflags : ");
-                 system("$HIP_PATH/bin/hipcc --cxxflags");
-                 printf("\n");
-@@ -219,8 +219,8 @@ if (!$printed or $p_full) {
-         system ("uname -a");
-     }
- 
--    if (-e "/usr/bin/lsb_release") {
--        system ("/usr/bin/lsb_release -a");
-+    if (-e "@lsb_release@/bin/lsb_release") {
-+        system ("@lsb_release@/bin/lsb_release -a");
-     }
- 
-     print "\n" ;
-diff --git a/hip-lang-config.cmake.in b/hip-lang-config.cmake.in
-index 9250a68..f6e27b7 100644
---- a/hip-lang-config.cmake.in
-+++ b/hip-lang-config.cmake.in
-@@ -71,8 +71,8 @@ get_filename_component(_IMPORT_PREFIX "${_DIR}/../../../" REALPATH)
- 
- 
- #need _IMPORT_PREFIX to be set #FILE_REORG_BACKWARD_COMPATIBILITY
--file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS "${_IMPORT_PREFIX}/../llvm/lib/clang/*/include")
--file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS_REORG "${_IMPORT_PREFIX}/llvm/lib/clang/*/include")
-+file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS "@clang@/resource-root/include")
-+file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS_REORG "@clang@/resource-root/include")
- find_path(HIP_CLANG_INCLUDE_PATH __clang_cuda_math.h
-     HINTS ${HIP_CLANG_INCLUDE_SEARCH_PATHS}
-           ${HIP_CLANG_INCLUDE_SEARCH_PATHS_REORG}
-@@ -89,7 +89,7 @@ find_path(HSA_HEADER hsa/hsa.h
-   PATHS
-     "${_IMPORT_PREFIX}/../include" #FILE_REORG_BACKWARD_COMPATIBILITY
-     "${_IMPORT_PREFIX}/include"
--    "${ROCM_PATH}/include"
-+    "@rocm_runtime@/include"
- )
- 
- if (NOT HSA_HEADER)
-@@ -97,7 +97,7 @@ if (NOT HSA_HEADER)
- endif()
- 
- get_filename_component(HIP_COMPILER_INSTALL_PATH ${CMAKE_HIP_COMPILER} DIRECTORY)
--file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS "${HIP_COMPILER_INSTALL_PATH}/../lib/clang/*/lib/*")
-+file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS "@clang@/resource-root/lib/*")
- find_library(CLANGRT_BUILTINS
-     NAMES
-       clang_rt.builtins
diff --git a/pkgs/development/compilers/hip/0000-fixup-paths.patch b/pkgs/development/compilers/hip/0000-fixup-paths.patch
deleted file mode 100644
index 423857218ee7..000000000000
--- a/pkgs/development/compilers/hip/0000-fixup-paths.patch
+++ /dev/null
@@ -1,62 +0,0 @@
-diff --git a/hip-config.cmake.in b/hip-config.cmake.in
-index 89d1224..dc9ba05 100755
---- a/hip-config.cmake.in
-+++ b/hip-config.cmake.in
-@@ -142,7 +142,7 @@ if(HIP_COMPILER STREQUAL "clang")
-       file(TO_CMAKE_PATH "${HIP_PATH}/../lc" HIP_CLANG_ROOT)
-     endif()
-   else()
--    set(HIP_CLANG_ROOT "${ROCM_PATH}/llvm")
-+    set(HIP_CLANG_ROOT "@clang@")
-   endif()
-   if(NOT HIP_CXX_COMPILER)
-     set(HIP_CXX_COMPILER ${CMAKE_CXX_COMPILER})
-@@ -171,7 +171,7 @@ if(HIP_COMPILER STREQUAL "clang")
-     get_filename_component(_HIP_CLANG_BIN_PATH "${_HIP_CLANG_REAL_PATH}" DIRECTORY)
-     get_filename_component(HIP_CLANG_ROOT "${_HIP_CLANG_BIN_PATH}" DIRECTORY)
-   endif()
--  file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS ${HIP_CLANG_ROOT}/lib/clang/*/include)
-+  file(GLOB HIP_CLANG_INCLUDE_SEARCH_PATHS "@clang@/resource-root/include")
-   find_path(HIP_CLANG_INCLUDE_PATH stddef.h
-       HINTS
-           ${HIP_CLANG_INCLUDE_SEARCH_PATHS}
-@@ -209,7 +209,7 @@ if(NOT WIN32)
-       "${_IMPORT_PREFIX}/include"
-       #FILE_REORG_BACKWARD_COMPATIBILITY ${_IMPORT_PREFIX}/../include is for Backward compatibility
-       "${_IMPORT_PREFIX}/../include"
--      ${ROCM_PATH}/include
-+      "@rocm_runtime@/include"
-   )
- 
-   if (NOT HSA_HEADER)
-@@ -291,7 +291,7 @@ if(HIP_COMPILER STREQUAL "clang")
-     endif()
-   endif()
- 
--  file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS "${HIP_CLANG_ROOT}/lib/clang/*/lib/*")
-+  file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS "@clang@/resource-root/lib/*")
-   find_library(CLANGRT_BUILTINS
-     NAMES
-       clang_rt.builtins
-diff --git a/src/hip_embed_pch.sh b/src/hip_embed_pch.sh
-index 0a1572b..2feb19a 100755
---- a/src/hip_embed_pch.sh
-+++ b/src/hip_embed_pch.sh
-@@ -149,7 +149,7 @@ EOF
- 
-   $LLVM_DIR/bin/clang -cc1 -O3 -emit-pch -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -std=c++17 -fgnuc-version=4.2.1 -o $tmp/hip_wave64.pch -x hip-cpp-output - <$tmp/pch_wave64.cui &&
- 
--  $LLVM_DIR/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj &&
-+  @llvm@/bin/llvm-mc -o hip_pch.o $tmp/hip_pch.mcin --filetype=obj &&
- 
-   rm -rf $tmp
- }
-@@ -195,7 +195,7 @@ EOF
-   set -x
-   $LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib --hip-version=4.4 -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -D__HIPCC_RTC__ -x hip $tmp/hipRTC_header.h -E -o $tmp/hiprtc &&
-   cat $macroFile >> $tmp/hiprtc &&
--  $LLVM_DIR/bin/llvm-mc -o $tmp/hiprtc_header.o $tmp/hipRTC_header.mcin --filetype=obj &&
-+  @llvm@/bin/llvm-mc -o $tmp/hiprtc_header.o $tmp/hipRTC_header.mcin --filetype=obj &&
-   $LLVM_DIR/bin/clang $tmp/hiprtc_header.o -o $rtc_shared_lib_out -shared &&
-   $LLVM_DIR/bin/clang -O3 --rocm-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib -nogpuinc -emit-llvm -c -o $tmp/tmp.bc --cuda-device-only -D__HIPCC_RTC__ --offload-arch=gfx906 -x hip-cpp-output $tmp/hiprtc &&
-   rm -rf $tmp
diff --git a/pkgs/development/compilers/hip/default.nix b/pkgs/development/compilers/hip/default.nix
deleted file mode 100644
index 26fce1d8d490..000000000000
--- a/pkgs/development/compilers/hip/default.nix
+++ /dev/null
@@ -1,197 +0,0 @@
-{ lib
-, stdenv
-, fetchFromGitHub
-, fetchpatch
-, rocmUpdateScript
-, substituteAll
-, makeWrapper
-, hip-common
-, hipcc
-, rocclr
-, roctracer
-, cmake
-, perl
-, llvm
-, rocminfo
-, rocm-thunk
-, rocm-comgr
-, rocm-device-libs
-, rocm-runtime
-, rocm-opencl-runtime
-, cudatoolkit
-, numactl
-, libxml2
-, libX11
-, libglvnd
-, doxygen
-, graphviz
-, fontconfig
-, python3Packages
-, buildDocs ? true
-, buildTests ? false
-, useNVIDIA ? false
-}:
-
-let
-  hipPlatform = if useNVIDIA then "nvidia" else "amd";
-
-  wrapperArgs = [
-    "--prefix PATH : $out/bin"
-    "--prefix LD_LIBRARY_PATH : ${rocm-runtime}"
-    "--set HIP_PLATFORM ${hipPlatform}"
-    "--set HIP_PATH $out"
-    "--set HIP_CLANG_PATH ${stdenv.cc}/bin"
-    "--set DEVICE_LIB_PATH ${rocm-device-libs}/amdgcn/bitcode"
-    "--set HSA_PATH ${rocm-runtime}"
-    "--set ROCM_PATH $out"
-  ] ++ lib.optionals useNVIDIA [
-    "--set CUDA_PATH ${cudatoolkit}"
-  ];
-in stdenv.mkDerivation (finalAttrs: {
-  pname = "hip-${hipPlatform}";
-  version = "5.4.4";
-
-  outputs = [
-    "out"
-  ] ++ lib.optionals buildDocs [
-    "doc"
-  ];
-
-  src = fetchFromGitHub {
-    owner = "ROCm-Developer-Tools";
-    repo = "hipamd";
-    rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-FcuylhkG7HqLYXH1J6ND6IVEIbDzHp7h7jg2ZZ4XoFM=";
-  };
-
-  patches = [
-    (substituteAll {
-      src = ./0000-fixup-paths.patch;
-      inherit llvm;
-      clang = stdenv.cc;
-      rocm_runtime = rocm-runtime;
-    })
-
-    # https://github.com/ROCm-Developer-Tools/hipamd/commit/be33ec55acc104a59d01df5912261d007c7f3ee9
-    (fetchpatch {
-      url = "https://github.com/ROCm-Developer-Tools/hipamd/commit/be33ec55acc104a59d01df5912261d007c7f3ee9.patch";
-      hash = "sha256-eTC4mUIN1FwRce1n38uDOlITFL/vpcOhvnaZTo5R7lo=";
-    })
-  ];
-
-  nativeBuildInputs = [
-    makeWrapper
-    cmake
-    perl
-    python3Packages.python
-    python3Packages.cppheaderparser
-  ] ++ lib.optionals buildDocs [
-    doxygen
-    graphviz
-    fontconfig
-  ];
-
-  buildInputs = [
-    numactl
-    libxml2
-    libX11
-    libglvnd
-  ];
-
-  propagatedBuildInputs = [
-    stdenv.cc
-    llvm
-    rocminfo
-    rocm-thunk
-    rocm-comgr
-    rocm-device-libs
-    rocm-runtime
-    rocm-opencl-runtime
-  ] ++ lib.optionals useNVIDIA [
-    cudatoolkit
-  ];
-
-  cmakeFlags = [
-    "-DROCM_PATH=${rocminfo}"
-    "-DHIP_PLATFORM=${hipPlatform}"
-    "-DHIP_COMMON_DIR=${hip-common}"
-    "-DHIPCC_BIN_DIR=${hipcc}/bin"
-    "-DHIP_LLVM_ROOT=${stdenv.cc}"
-    "-DROCCLR_PATH=${rocclr}"
-    "-DAMD_OPENCL_PATH=${rocm-opencl-runtime.src}"
-    "-DPROF_API_HEADER_PATH=${roctracer.src}/inc/ext"
-    # Temporarily set variables to work around upstream CMakeLists issue
-    # Can be removed once https://github.com/ROCm-Developer-Tools/hipamd/issues/55 is fixed
-    "-DCMAKE_INSTALL_BINDIR=bin"
-    "-DCMAKE_INSTALL_INCLUDEDIR=include"
-    "-DCMAKE_INSTALL_LIBDIR=lib"
-  ] ++ lib.optionals buildTests [
-    "-DHIP_CATCH_TEST=1"
-  ];
-
-  postPatch = ''
-    export HIP_CLANG_PATH=${stdenv.cc}/bin
-    patchShebangs src
-  '' + lib.optionalString buildDocs ''
-    export HOME=$(mktemp -d)
-    export FONTCONFIG_FILE=${fontconfig.out}/etc/fonts/fonts.conf
-  '';
-
-  doCheck = buildTests;
-  checkTarget = "build_tests";
-
-  preCheck = lib.optionalString buildTests ''
-    export ROCM_PATH=$PWD
-    export DEVICE_LIB_PATH=${rocm-device-libs}/amdgcn/bitcode
-    patchShebangs bin
-  '';
-
-  postInstall = ''
-    patchShebangs $out/bin
-    cp -a $out/bin/hipcc $out/bin/hipcc-pl
-    cp -a $out/bin/hipconfig $out/bin/hipconfig-pl
-    wrapProgram $out/bin/hipcc --set HIP_USE_PERL_SCRIPTS 0
-    wrapProgram $out/bin/hipconfig --set HIP_USE_PERL_SCRIPTS 0
-    wrapProgram $out/bin/hipcc.bin ${lib.concatStringsSep " " wrapperArgs}
-    wrapProgram $out/bin/hipconfig.bin ${lib.concatStringsSep " " wrapperArgs}
-    wrapProgram $out/bin/hipcc-pl --set HIP_USE_PERL_SCRIPTS 1
-    wrapProgram $out/bin/hipconfig-pl --set HIP_USE_PERL_SCRIPTS 1
-    wrapProgram $out/bin/hipcc.pl ${lib.concatStringsSep " " wrapperArgs}
-    wrapProgram $out/bin/hipconfig.pl ${lib.concatStringsSep " " wrapperArgs}
-  '';
-
-  passthru = {
-    # All known and valid general GPU targets
-    # We cannot use this for each ROCm library, as each defines their own supported targets
-    # See: https://github.com/RadeonOpenCompute/ROCm/blob/77cbac4abab13046ee93d8b5bf410684caf91145/README.md#library-target-matrix
-    gpuTargets = lib.forEach [
-      "803"
-      "900"
-      "906"
-      "908"
-      "90a"
-      "1010"
-      "1012"
-      "1030"
-    ] (target: "gfx${target}");
-
-    updateScript = rocmUpdateScript {
-      name = finalAttrs.pname;
-      owner = finalAttrs.src.owner;
-      repo = finalAttrs.src.repo;
-    };
-  };
-
-  meta = with lib; {
-    description = "C++ Heterogeneous-Compute Interface for Portability specifically for AMD platform";
-    homepage = "https://github.com/ROCm-Developer-Tools/hipamd";
-    license = with licenses; [ mit ];
-    maintainers = with maintainers; [ lovesegfault ] ++ teams.rocm.members;
-    platforms = platforms.linux;
-    # Tests require GPU, also include issues
-    broken =
-      versions.minor finalAttrs.version != versions.minor hip-common.version ||
-      versions.minor finalAttrs.version != versions.minor hipcc.version ||
-      buildTests;
-  };
-})
diff --git a/pkgs/development/compilers/hipcc/0000-fixup-paths.patch b/pkgs/development/compilers/hipcc/0000-fixup-paths.patch
deleted file mode 100644
index 4f52c1ad11aa..000000000000
--- a/pkgs/development/compilers/hipcc/0000-fixup-paths.patch
+++ /dev/null
@@ -1,130 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index c21f247..5bd3e45 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -17,6 +17,6 @@ if (NOT WIN32) # C++17 does not require the std lib linking
-   target_link_libraries(hipconfig.bin ${LINK_LIBS} ) # for hipconfig.bin
- endif()
- 
--set(HIP_VERSION_MAJOR 4 PARENT_SCOPE)
--set(HIP_VERSION_MINOR 4 PARENT_SCOPE)
--set(HIP_VERSION_PATCH 4 PARENT_SCOPE)
-+set(HIP_VERSION_MAJOR @version_major@)
-+set(HIP_VERSION_MINOR @version_minor@)
-+set(HIP_VERSION_PATCH @version_patch@)
-diff --git a/src/hipBin_amd.h b/src/hipBin_amd.h
-index f94e4a5..f0b1b83 100644
---- a/src/hipBin_amd.h
-+++ b/src/hipBin_amd.h
-@@ -207,7 +207,7 @@ void HipBinAmd::initializeHipCXXFlags() {
-   hipClangIncludePath = getCompilerIncludePath();
-   hipCXXFlags += " -isystem \"" + hipClangIncludePath;
-   fs::path hipCXXFlagsTempFs = hipCXXFlags;
--  hipCXXFlagsTempFs /= "..\"";
-+  hipCXXFlagsTempFs /= "\"";
-   hipCXXFlags = hipCXXFlagsTempFs.string();
-   const EnvVariables& var = getEnvVariables();
-   // Allow __fp16 as function parameter and return type.
-@@ -266,7 +266,7 @@ void HipBinAmd::printCompilerInfo() const {
-     string cmd = hipClangPath + "/clang++ --version";
-     system(cmd.c_str());  // hipclang version
-     cout << "llc-version :" << endl;
--    cmd = hipClangPath + "/llc --version";
-+    cmd = "@llvm@/bin/llc --version";
-     system(cmd.c_str());  // llc version
-     cout << "hip-clang-cxxflags :" << endl;
-     cmd = hipPath + "/bin/hipcc  --cxxflags";
-@@ -278,7 +278,7 @@ void HipBinAmd::printCompilerInfo() const {
-   } else {
-     string cmd = hipClangPath + "/clang++ --version";
-     system(cmd.c_str());  // hipclang version
--    cmd = hipClangPath + "/llc --version";
-+    cmd = "@llvm@/bin/llc --version";
-     system(cmd.c_str());  // llc version
-     cout << "hip-clang-cxxflags :" << endl;
-     cmd = hipPath + "/bin/hipcc --cxxflags";
-@@ -331,10 +331,7 @@ string HipBinAmd::getCppConfig() {
-   hipPathInclude /= "include";
- 
-   const string& compilerPath = getCompilerPath();
--  hipClangInclude = compilerPath;
--  hipClangInclude = hipClangInclude.parent_path();
--  hipClangInclude /= "lib/clang/";
--  hipClangInclude /= compilerVersion;
-+  hipClangInclude = "@clang@/resource-root/include";
-   string hipClangPath = hipClangInclude.string();
- 
-   const OsType& osInfo = getOSInfo();
-@@ -442,17 +439,7 @@ string HipBinAmd::getHipCC() const {
- 
- 
- string HipBinAmd::getCompilerIncludePath() {
--  string hipClangVersion, includePath, compilerIncludePath;
--  const string& hipClangPath = getCompilerPath();
--  hipClangVersion = getCompilerVersion();
--  fs::path includePathfs = hipClangPath;
--  includePathfs = includePathfs.parent_path();
--  includePathfs /= "lib/clang/";
--  includePathfs /= hipClangVersion;
--  includePathfs /= "include";
--  includePathfs = fs::absolute(includePathfs).string();
--  compilerIncludePath = includePathfs.string();
--  return compilerIncludePath;
-+  return "@clang@/resource-root/include";
- }
- 
- 
-@@ -506,8 +493,8 @@ void HipBinAmd::printFull() {
-   cout << endl << "== Envirnoment Variables" << endl;
-   printEnvironmentVariables();
-   getSystemInfo();
--  if (fs::exists("/usr/bin/lsb_release"))
--    system("/usr/bin/lsb_release -a");
-+  if (fs::exists("@lsb_release@/bin/lsb_release"))
-+    system("@lsb_release@/bin/lsb_release -a");
-   cout << endl;
- }
- 
-@@ -993,7 +980,7 @@ void HipBinAmd::executeHipCCCmd(vector<string> argv) {
-     } else if (os != windows) {
-       // Else try using rocm_agent_enumerator
-       string ROCM_AGENT_ENUM;
--      ROCM_AGENT_ENUM = roccmPath + "/bin/rocm_agent_enumerator";
-+      ROCM_AGENT_ENUM = "@rocminfo@/bin/rocm_agent_enumerator";
-       targetsStr = ROCM_AGENT_ENUM +" -t GPU";
-       SystemCmdOut sysOut = hipBinUtilPtr_->exec(targetsStr.c_str());
-       regex toReplace("\n+");
-@@ -1097,7 +1084,7 @@ void HipBinAmd::executeHipCCCmd(vector<string> argv) {
-     string hipClangVersion, toolArgTemp;
-     if (linkType == 0) {
-       toolArgTemp = " -L"+ hipLibPath + "-lamdhip64 -L" +
--                      roccmPath+ "/lib -lhsa-runtime64 -ldl -lnuma " + toolArgs;
-+                      "@rocm_runtime@/lib -lhsa-runtime64 -ldl -lnuma " + toolArgs;
-       toolArgs = toolArgTemp;
-     } else {
-       toolArgTemp =  toolArgs + " -Wl,--enable-new-dtags -Wl,-rpath=" + hipLibPath + ":"
-@@ -1107,8 +1094,7 @@ void HipBinAmd::executeHipCCCmd(vector<string> argv) {
- 
-     hipClangVersion = getCompilerVersion();
-     // To support __fp16 and _Float16, explicitly link with compiler-rt
--    toolArgs += " -L" + hipClangPath + "/../lib/clang/" +
--                hipClangVersion + "/lib/linux -lclang_rt.builtins-x86_64 ";
-+    toolArgs += " -L@clang@/resource-root/lib/linux -lclang_rt.builtins-x86_64 ";
-   }
-   if (!var.hipccCompileFlagsAppendEnv_.empty()) {
-     HIPCXXFLAGS += " " + var.hipccCompileFlagsAppendEnv_ + " ";
-diff --git a/src/hipBin_nvidia.h b/src/hipBin_nvidia.h
-index 6feb315..b61739d 100644
---- a/src/hipBin_nvidia.h
-+++ b/src/hipBin_nvidia.h
-@@ -157,8 +157,8 @@ void HipBinNvidia::printFull() {
-   cout << endl << "== Envirnoment Variables" << endl;
-   printEnvironmentVariables();
-   getSystemInfo();
--  if (fs::exists("/usr/bin/lsb_release"))
--    system("/usr/bin/lsb_release -a");
-+  if (fs::exists("@lsb_release@/bin/lsb_release"))
-+    system("@lsb_release@/bin/lsb_release -a");
- }
- 
- // returns hip include
diff --git a/pkgs/development/libraries/rocclr/default.nix b/pkgs/development/libraries/rocclr/default.nix
deleted file mode 100644
index 09876ea98a91..000000000000
--- a/pkgs/development/libraries/rocclr/default.nix
+++ /dev/null
@@ -1,64 +0,0 @@
-{ lib
-, stdenv
-, fetchFromGitHub
-, fetchpatch
-, rocmUpdateScript
-, rocm-comgr
-}:
-
-stdenv.mkDerivation (finalAttrs: {
-  pname = "rocclr";
-  version = "5.4.4";
-
-  src = fetchFromGitHub {
-    owner = "ROCm-Developer-Tools";
-    repo = "ROCclr";
-    rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-DbN7kL8oyaPeYQB19Q96L3wX66v62TMSWl0Yor7Q4kE=";
-  };
-
-  patches = [
-    # Enable support for gfx8 again
-    # See the upstream issue: https://github.com/RadeonOpenCompute/ROCm/issues/1659
-    # And the arch patch: https://github.com/rocm-arch/rocm-arch/pull/742
-    (fetchpatch {
-      url = "https://raw.githubusercontent.com/John-Gee/rocm-arch/d6812d308fee3caf2b6bb01b4d19fe03a6a0e3bd/rocm-opencl-runtime/enable-gfx800.patch";
-      hash = "sha256-59jFDIIsTTZcNns9RyMVWPRUggn/bSlAGrky4quu8B4=";
-    })
-  ];
-
-  postPatch = ''
-    substituteInPlace device/comgrctx.cpp \
-      --replace "libamd_comgr.so" "${rocm-comgr}/lib/libamd_comgr.so"
-  '';
-
-  dontConfigure = true;
-  dontBuild = true;
-
-  installPhase = ''
-    runHook preInstall
-
-    mkdir -p $out
-    cp -a * $out/
-
-    runHook postInstall
-  '';
-
-  passthru.updateScript = rocmUpdateScript {
-    name = finalAttrs.pname;
-    owner = finalAttrs.src.owner;
-    repo = finalAttrs.src.repo;
-  };
-
-  meta = with lib; {
-    description = "Source package of the Radeon Open Compute common language runtime";
-    homepage = "https://github.com/ROCm-Developer-Tools/ROCclr";
-    license = licenses.mit;
-    maintainers = with maintainers; [ lovesegfault ] ++ teams.rocm.members;
-    # rocclr seems to have some AArch64 ifdefs, but does not seem
-    # to be supported yet by the build infrastructure. Recheck in
-    # the future.
-    platforms = [ "x86_64-linux" ];
-    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
-  };
-})
diff --git a/pkgs/development/libraries/rocm-comgr/cmake.patch b/pkgs/development/libraries/rocm-comgr/cmake.patch
deleted file mode 100644
index ae966745171c..000000000000
--- a/pkgs/development/libraries/rocm-comgr/cmake.patch
+++ /dev/null
@@ -1,365 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 62b857b..d21c7f4 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -147,8 +147,8 @@ if (UNIX)
-   list(APPEND AMD_COMGR_PUBLIC_LINKER_OPTIONS -pthread)
-   if (NOT APPLE AND COMGR_BUILD_SHARED_LIBS)
-     configure_file(
--      ${CMAKE_CURRENT_SOURCE_DIR}/src/exportmap.in
--      ${CMAKE_CURRENT_BINARY_DIR}/src/exportmap @ONLY)
-+      src/exportmap.in
-+      src/exportmap @ONLY)
-     list(APPEND AMD_COMGR_PRIVATE_LINKER_OPTIONS
-       "-Wl,--version-script=${CMAKE_CURRENT_BINARY_DIR}/src/exportmap")
-     # When building a shared library with -fsanitize=address we can't be
-@@ -175,10 +175,6 @@ endif()
- # the shared header.
- list(APPEND AMD_COMGR_PRIVATE_COMPILE_DEFINITIONS AMD_COMGR_EXPORT)
- 
--configure_file(
--  ${CMAKE_CURRENT_SOURCE_DIR}/include/amd_comgr.h.in
--  ${CMAKE_CURRENT_BINARY_DIR}/include/amd_comgr.h @ONLY)
--
- include(bc2h)
- include(opencl_pch)
- include(DeviceLibs)
-@@ -212,10 +208,14 @@ target_include_directories(amd_comgr
-     $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
-     $<INSTALL_INTERFACE:include>)
- 
-+configure_file(
-+  include/amd_comgr.h.in
-+  include/amd_comgr.h @ONLY)
-+
- set(AMD_COMGR_CONFIG_NAME amd_comgr-config.cmake)
- set(AMD_COMGR_TARGETS_NAME amd_comgr-targets.cmake)
- set(AMD_COMGR_VERSION_NAME amd_comgr-config-version.cmake)
--set(AMD_COMGR_PACKAGE_PREFIX ${CMAKE_INSTALL_LIBDIR}/cmake/amd_comgr)
-+set(AMD_COMGR_PACKAGE_PREFIX cmake/amd_comgr)
- 
- # Generate the build-tree package.
- set(AMD_COMGR_PREFIX_CODE)
-@@ -226,13 +226,13 @@ if (NOT COMGR_BUILD_SHARED_LIBS)
- endif()
- 
- set(AMD_COMGR_TARGETS_PATH
--  "${CMAKE_CURRENT_BINARY_DIR}/${AMD_COMGR_PACKAGE_PREFIX}/${AMD_COMGR_TARGETS_NAME}")
-+  "${CMAKE_CURRENT_BINARY_DIR}/lib/${AMD_COMGR_PACKAGE_PREFIX}/${AMD_COMGR_TARGETS_NAME}")
- set(AMD_COMGR_VERSION_PATH
--  "${CMAKE_CURRENT_BINARY_DIR}/${AMD_COMGR_PACKAGE_PREFIX}/${AMD_COMGR_VERSION_NAME}")
-+  "${CMAKE_CURRENT_BINARY_DIR}/lib/${AMD_COMGR_PACKAGE_PREFIX}/${AMD_COMGR_VERSION_NAME}")
- export(TARGETS amd_comgr
--  FILE "${AMD_COMGR_PACKAGE_PREFIX}/${AMD_COMGR_TARGETS_NAME}")
-+  FILE "lib/${AMD_COMGR_PACKAGE_PREFIX}/${AMD_COMGR_TARGETS_NAME}")
- configure_file("cmake/${AMD_COMGR_CONFIG_NAME}.in"
--  "${AMD_COMGR_PACKAGE_PREFIX}/${AMD_COMGR_CONFIG_NAME}"
-+  "lib/${AMD_COMGR_PACKAGE_PREFIX}/${AMD_COMGR_CONFIG_NAME}"
-   @ONLY)
- write_basic_package_version_file("${AMD_COMGR_VERSION_PATH}"
-   VERSION "${amd_comgr_VERSION}"
-@@ -266,7 +266,7 @@ install(FILES
- set(AMD_COMGR_PREFIX_CODE "
- # Derive absolute install prefix from config file path.
- get_filename_component(AMD_COMGR_PREFIX \"\${CMAKE_CURRENT_LIST_FILE}\" PATH)")
--string(REGEX REPLACE "/" ";" count "${AMD_COMGR_PACKAGE_PREFIX}")
-+string(REGEX REPLACE "/" ";" count "${CMAKE_INSTALL_LIBDIR}/${AMD_COMGR_PACKAGE_PREFIX}")
- foreach(p ${count})
-   set(AMD_COMGR_PREFIX_CODE "${AMD_COMGR_PREFIX_CODE}
- get_filename_component(AMD_COMGR_PREFIX \"\${AMD_COMGR_PREFIX}\" PATH)")
-@@ -278,20 +278,20 @@ if (NOT COMGR_BUILD_SHARED_LIBS)
-   string(APPEND AMD_COMGR_PREFIX_CODE "find_dependency(LLD REQUIRED)\n")
- endif()
- 
--set(AMD_COMGR_TARGETS_PATH "\${AMD_COMGR_PREFIX}/${AMD_COMGR_PACKAGE_PREFIX}/${AMD_COMGR_TARGETS_NAME}")
-+set(AMD_COMGR_TARGETS_PATH "\${AMD_COMGR_PREFIX}/${CMAKE_INSTALL_LIBDIR}/${AMD_COMGR_PACKAGE_PREFIX}/${AMD_COMGR_TARGETS_NAME}")
- configure_file("cmake/${AMD_COMGR_CONFIG_NAME}.in"
--  "${CMAKE_CURRENT_BINARY_DIR}/${AMD_COMGR_CONFIG_NAME}.install"
-+  "${AMD_COMGR_CONFIG_NAME}.install"
-   @ONLY)
- install(FILES
-   "${CMAKE_CURRENT_BINARY_DIR}/${AMD_COMGR_CONFIG_NAME}.install"
--  DESTINATION "${AMD_COMGR_PACKAGE_PREFIX}"
-+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/${AMD_COMGR_PACKAGE_PREFIX}"
-   RENAME "${AMD_COMGR_CONFIG_NAME}")
- install(EXPORT amd_comgr_export
--  DESTINATION "${AMD_COMGR_PACKAGE_PREFIX}"
-+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/${AMD_COMGR_PACKAGE_PREFIX}"
-   FILE "${AMD_COMGR_TARGETS_NAME}")
- install(FILES
-   "${AMD_COMGR_VERSION_PATH}"
--  DESTINATION "${AMD_COMGR_PACKAGE_PREFIX}")
-+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/${AMD_COMGR_PACKAGE_PREFIX}")
- 
- if(TARGET clangFrontendTool)
-   set(CLANG_LIBS
-diff --git a/cmake/DeviceLibs.cmake b/cmake/DeviceLibs.cmake
-index 27e9546..dfe1b57 100644
---- a/cmake/DeviceLibs.cmake
-+++ b/cmake/DeviceLibs.cmake
-@@ -1,8 +1,7 @@
- set(INC_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
- 
- set(GEN_LIBRARY_INC_FILE ${INC_DIR}/libraries.inc)
--
--file(WRITE ${GEN_LIBRARY_INC_FILE} "// Automatically generated file; DO NOT EDIT.\n")
-+set(GEN_LIBRARY_DEFS_INC_FILE ${INC_DIR}/libraries_defs.inc)
- 
- # cmake does not provide a way to query targets produced by a project,
- # so we have to make one up. Ordinarily, individual library target
-@@ -23,6 +22,7 @@ if(NOT AMD_DEVICE_LIBS_TARGETS)
-   message(FATAL_ERROR "Could not find list of device libraries")
- endif()
- 
-+set(TARGETS_INCLUDES "")
- foreach(AMDGCN_LIB_TARGET ${AMD_DEVICE_LIBS_TARGETS})
-   set(header ${AMDGCN_LIB_TARGET}.inc)
- 
-@@ -54,75 +54,52 @@ foreach(AMDGCN_LIB_TARGET ${AMD_DEVICE_LIBS_TARGETS})
-   add_custom_target(${AMDGCN_LIB_TARGET}_header DEPENDS ${INC_DIR}/${header})
-   add_dependencies(amd_comgr ${AMDGCN_LIB_TARGET}_header)
- 
--  file(APPEND ${GEN_LIBRARY_INC_FILE} "#include \"${header}\"\n")
-+  list(APPEND TARGETS_INCLUDES "#include \"${header}\"")
-+endforeach()
-+
-+list(JOIN TARGETS_INCLUDES "\n" TARGETS_INCLUDES)
-+file(GENERATE OUTPUT ${GEN_LIBRARY_INC_FILE} CONTENT "${TARGETS_INCLUDES}")
-+
-+foreach(OPENCL_VERSION 1.2 2.0)
-+  string(REPLACE . _ OPENCL_UNDERSCORE_VERSION ${OPENCL_VERSION})
-+  add_custom_command(OUTPUT ${INC_DIR}/opencl${OPENCL_VERSION}-c.inc
-+    COMMAND bc2h ${CMAKE_CURRENT_BINARY_DIR}/opencl${OPENCL_VERSION}-c.pch
-+                 ${INC_DIR}/opencl${OPENCL_VERSION}-c.inc
-+                 opencl${OPENCL_UNDERSCORE_VERSION}_c
-+    DEPENDS bc2h ${CMAKE_CURRENT_BINARY_DIR}/opencl${OPENCL_VERSION}-c.pch
-+    COMMENT "Generating opencl${OPENCL_VERSION}-c.inc"
-+  )
-+  set_property(DIRECTORY APPEND PROPERTY
-+    ADDITIONAL_MAKE_CLEAN_FILES ${INC_DIR}/opencl${OPENCL_VERSION}-c.inc)
-+  add_custom_target(opencl${OPENCL_VERSION}-c.inc_target DEPENDS ${INC_DIR}/opencl${OPENCL_VERSION}-c.inc)
-+  add_dependencies(amd_comgr opencl${OPENCL_VERSION}-c.inc_target)
- endforeach()
- 
--add_custom_command(OUTPUT ${INC_DIR}/opencl1.2-c.inc
--  COMMAND bc2h ${CMAKE_CURRENT_BINARY_DIR}/opencl1.2-c.pch
--               ${INC_DIR}/opencl1.2-c.inc
--               opencl1_2_c
--  DEPENDS bc2h ${CMAKE_CURRENT_BINARY_DIR}/opencl1.2-c.pch
--  COMMENT "Generating opencl1.2-c.inc"
--)
--set_property(DIRECTORY APPEND PROPERTY
--  ADDITIONAL_MAKE_CLEAN_FILES ${INC_DIR}/opencl1.2-c.inc)
--add_custom_target(opencl1.2-c.inc_target DEPENDS ${INC_DIR}/opencl1.2-c.inc)
--add_dependencies(amd_comgr opencl1.2-c.inc_target)
--file(APPEND ${GEN_LIBRARY_INC_FILE} "#include \"opencl1.2-c.inc\"\n")
--
--add_custom_command(OUTPUT ${INC_DIR}/opencl2.0-c.inc
--  COMMAND bc2h ${CMAKE_CURRENT_BINARY_DIR}/opencl2.0-c.pch
--               ${INC_DIR}/opencl2.0-c.inc
--               opencl2_0_c
--  DEPENDS bc2h ${CMAKE_CURRENT_BINARY_DIR}/opencl2.0-c.pch
--  COMMENT "Generating opencl2.0-c.inc"
--)
--set_property(DIRECTORY APPEND PROPERTY
--  ADDITIONAL_MAKE_CLEAN_FILES ${INC_DIR}/opencl2.0-c.inc)
--add_custom_target(opencl2.0-c.inc_target DEPENDS ${INC_DIR}/opencl2.0-c.inc)
--add_dependencies(amd_comgr opencl2.0-c.inc_target)
--file(APPEND ${GEN_LIBRARY_INC_FILE} "#include \"opencl2.0-c.inc\"\n")
--
--# Generate function to select libraries for a given GFXIP number.
--file(APPEND ${GEN_LIBRARY_INC_FILE} "#include \"llvm/ADT/StringRef.h\"\n")
--file(APPEND ${GEN_LIBRARY_INC_FILE}
--  "static std::tuple<const char*, const void*, size_t> get_oclc_isa_version(llvm::StringRef gfxip) {")
-+set(TARGETS_DEFS "")
-+list(APPEND TARGETS_DEFS "#ifndef AMD_DEVICE_LIBS_TARGET\n#define AMD_DEVICE_LIBS_TARGET(t)\n#endif")
-+list(APPEND TARGETS_DEFS "#ifndef AMD_DEVICE_LIBS_GFXIP\n#define AMD_DEVICE_LIBS_GFXIP(t, g)\n#endif")
-+list(APPEND TARGETS_DEFS "#ifndef AMD_DEVICE_LIBS_FUNCTION\n#define AMD_DEVICE_LIBS_FUNCTION(t, f)\n#endif")
-+list(APPEND TARGETS_DEFS "")
- foreach(AMDGCN_LIB_TARGET ${AMD_DEVICE_LIBS_TARGETS})
-+  list(APPEND TARGETS_DEFS "AMD_DEVICE_LIBS_TARGET(${AMDGCN_LIB_TARGET})")
-+  # Generate function to select libraries for a given GFXIP number.
-   if (${AMDGCN_LIB_TARGET} MATCHES "^oclc_isa_version_.+$")
-     string(REGEX REPLACE "^oclc_isa_version_(.+)$" "\\1" gfxip ${AMDGCN_LIB_TARGET})
--    file(APPEND ${GEN_LIBRARY_INC_FILE}
--      "if (gfxip == \"${gfxip}\") return std::make_tuple(\"${AMDGCN_LIB_TARGET}.bc\", ${AMDGCN_LIB_TARGET}_lib, ${AMDGCN_LIB_TARGET}_lib_size);")
-+    list(APPEND TARGETS_DEFS "AMD_DEVICE_LIBS_GFXIP(${AMDGCN_LIB_TARGET}, \"${gfxip}\")")
-   endif()
--endforeach()
--file(APPEND ${GEN_LIBRARY_INC_FILE}
--  "return std::make_tuple(nullptr, nullptr, 0); }")
--
--# Generate function to select libraries for given feature.
--foreach(AMDGCN_LIB_TARGET ${AMD_DEVICE_LIBS_TARGETS})
-+  # Generate function to select libraries for given feature.
-   if (${AMDGCN_LIB_TARGET} MATCHES "^oclc_.*_on$")
-     string(REGEX REPLACE "^oclc_(.*)_on" "\\1" function ${AMDGCN_LIB_TARGET})
--    file(APPEND ${GEN_LIBRARY_INC_FILE}
--      "static std::tuple<const char*, const void*, size_t> get_oclc_${function}(bool on) { \
--       return std::make_tuple( \
--         on ? \"oclc_${function}_on_lib.bc\" : \"oclc_${function}_off_lib.bc\", \
--         on ? oclc_${function}_on_lib : oclc_${function}_off_lib, \
--         on ? oclc_${function}_on_lib_size : oclc_${function}_off_lib_size \
--       ); }")
-+    list(APPEND TARGETS_DEFS "AMD_DEVICE_LIBS_FUNCTION(${AMDGCN_LIB_TARGET}, ${function})")
-   endif()
- endforeach()
- 
--# Generate function yield all libraries.
--file(APPEND ${GEN_LIBRARY_INC_FILE} "\n#include \"llvm/ADT/ArrayRef.h\"\n")
--file(APPEND ${GEN_LIBRARY_INC_FILE}
--  "llvm::ArrayRef<std::tuple<llvm::StringRef, llvm::StringRef>> COMGR::getDeviceLibraries() { \
--   static std::tuple<llvm::StringRef, llvm::StringRef> DeviceLibs[] = {")
--foreach(AMDGCN_LIB_TARGET ${AMD_DEVICE_LIBS_TARGETS})
--  file(APPEND ${GEN_LIBRARY_INC_FILE}
--    "{\"${AMDGCN_LIB_TARGET}.bc\", llvm::StringRef(reinterpret_cast<const char *>(${AMDGCN_LIB_TARGET}_lib), ${AMDGCN_LIB_TARGET}_lib_size)},")
--endforeach()
--file(APPEND ${GEN_LIBRARY_INC_FILE}
--  "}; \
--   return DeviceLibs; \
--   }")
-+list(APPEND TARGETS_DEFS "")
-+list(APPEND TARGETS_DEFS "#undef AMD_DEVICE_LIBS_TARGET")
-+list(APPEND TARGETS_DEFS "#undef AMD_DEVICE_LIBS_GFXIP")
-+list(APPEND TARGETS_DEFS "#undef AMD_DEVICE_LIBS_FUNCTION")
-+
-+list(JOIN TARGETS_DEFS "\n" TARGETS_DEFS)
-+file(GENERATE OUTPUT ${GEN_LIBRARY_DEFS_INC_FILE} CONTENT "${TARGETS_DEFS}")
- 
- include_directories(${INC_DIR})
-diff --git a/cmake/bc2h.cmake b/cmake/bc2h.cmake
-index 146fe2b..9134985 100644
---- a/cmake/bc2h.cmake
-+++ b/cmake/bc2h.cmake
-@@ -1,40 +1,41 @@
--file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/bc2h.c
--"#include <stdio.h>\n"
--"int main(int argc, char **argv){\n"
--"    FILE *ifp, *ofp;\n"
--"    int c, i, l;\n"
--"    if (argc != 4) return 1;\n"
--"    ifp = fopen(argv[1], \"rb\");\n"
--"    if (!ifp) return 1;\n"
--"    i = fseek(ifp, 0, SEEK_END);\n"
--"    if (i < 0) return 1;\n"
--"    l = ftell(ifp);\n"
--"    if (l < 0) return 1;\n"
--"    i = fseek(ifp, 0, SEEK_SET);\n"
--"    if (i < 0) return 1;\n"
--"    ofp = fopen(argv[2], \"wb+\");\n"
--"    if (!ofp) return 1;\n"
--"    fprintf(ofp, \"#define %s_size %d\\n\\n\"\n"
--"                 \"#if defined __GNUC__\\n\"\n"
--"                 \"__attribute__((aligned (4096)))\\n\"\n"
--"                 \"#elif defined _MSC_VER\\n\"\n"
--"                 \"__declspec(align(4096))\\n\"\n"
--"                 \"#endif\\n\"\n"
--"                 \"static const unsigned char %s[%s_size+1] = {\",\n"
--"                 argv[3], l,\n"
--"                 argv[3], argv[3]);\n"
--"    i = 0;\n"
--"    while ((c = getc(ifp)) != EOF) {\n"
--"        if (0 == (i&7)) fprintf(ofp, \"\\n   \");\n"
--"        fprintf(ofp, \" 0x%02x,\", c);\n"
--"        ++i;\n"
--"    }\n"
--"    fprintf(ofp, \" 0x00\\n};\\n\\n\");\n"
--"    fclose(ifp);\n"
--"    fclose(ofp);\n"
--"    return 0;\n"
--"}\n"
--)
-+file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/bc2h.c
-+  CONTENT
-+"#include <stdio.h>
-+int main(int argc, char **argv){
-+    FILE *ifp, *ofp;
-+    int c, i, l;
-+    if (argc != 4) return 1;
-+    ifp = fopen(argv[1], \"rb\");
-+    if (!ifp) return 1;
-+    i = fseek(ifp, 0, SEEK_END);
-+    if (i < 0) return 1;
-+    l = ftell(ifp);
-+    if (l < 0) return 1;
-+    i = fseek(ifp, 0, SEEK_SET);
-+    if (i < 0) return 1;
-+    ofp = fopen(argv[2], \"wb+\");
-+    if (!ofp) return 1;
-+    fprintf(ofp, \"#define %s_size %d\\n\\n\"
-+                 \"#if defined __GNUC__\\n\"
-+                 \"__attribute__((aligned (4096)))\\n\"
-+                 \"#elif defined _MSC_VER\\n\"
-+                 \"__declspec(align(4096))\\n\"
-+                 \"#endif\\n\"
-+                 \"static const unsigned char %s[%s_size+1] = {\",
-+                 argv[3], l,
-+                 argv[3], argv[3]);
-+    i = 0;
-+    while ((c = getc(ifp)) != EOF) {
-+        if (0 == (i&7)) fprintf(ofp, \"\\n   \");
-+        fprintf(ofp, \" 0x%02x,\", c);
-+        ++i;
-+    }
-+    fprintf(ofp, \" 0x00\\n};\\n\\n\");
-+    fclose(ifp);
-+    fclose(ofp);
-+    return 0;
-+}
-+")
- 
- add_executable(bc2h ${CMAKE_CURRENT_BINARY_DIR}/bc2h.c)
- if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-diff --git a/src/comgr-device-libs.cpp b/src/comgr-device-libs.cpp
-index 4d2b914..80786d1 100644
---- a/src/comgr-device-libs.cpp
-+++ b/src/comgr-device-libs.cpp
-@@ -35,7 +35,7 @@
- 
- #include "comgr-device-libs.h"
- #include "comgr.h"
--#include "libraries.inc"
-+#include "comgr-libraries.h"
- #include "llvm/ADT/StringSwitch.h"
- #include <cstdint>
- 
-diff --git a/src/comgr-libraries.h b/src/comgr-libraries.h
-new file mode 100644
-index 0000000..3caa0a0
---- /dev/null
-+++ b/src/comgr-libraries.h
-@@ -0,0 +1,34 @@
-+#include "libraries.inc"
-+#include "opencl1.2-c.inc"
-+#include "opencl2.0-c.inc"
-+#include "llvm/ADT/StringRef.h"
-+#include "llvm/ADT/ArrayRef.h"
-+
-+static std::tuple<const char*, const void*, size_t> get_oclc_isa_version(llvm::StringRef gfxip) {
-+#define AMD_DEVICE_LIBS_GFXIP(target, target_gfxip) \
-+  if (gfxip == target_gfxip) return std::make_tuple(#target ".bc", target##_lib, target##_lib_size);
-+#include "libraries_defs.inc"
-+
-+  return std::make_tuple(nullptr, nullptr, 0);
-+}
-+
-+#define AMD_DEVICE_LIBS_FUNCTION(target, function) \
-+  static std::tuple<const char*, const void*, size_t> get_oclc_##function(bool on) { \
-+    return std::make_tuple( \
-+      on ? "oclc_" #function "_on_lib.bc" : "oclc_" #function "_off_lib.bc", \
-+      on ? oclc_##function##_on_lib : oclc_##function##_off_lib, \
-+      on ? oclc_##function##_on_lib_size : oclc_##function##_off_lib_size \
-+    ); \
-+  }
-+#include "libraries_defs.inc"
-+
-+llvm::ArrayRef<std::tuple<llvm::StringRef, llvm::StringRef>> COMGR::getDeviceLibraries() {
-+  static std::tuple<llvm::StringRef, llvm::StringRef> DeviceLibs[] = {
-+#define AMD_DEVICE_LIBS_TARGET(target) \
-+    {#target ".bc", llvm::StringRef(reinterpret_cast<const char *>(target##_lib), target##_lib_size)},
-+#include "libraries_defs.inc"
-+  };
-+  return DeviceLibs;
-+}
-+
-+
diff --git a/pkgs/development/libraries/rocm-opencl-icd/default.nix b/pkgs/development/libraries/rocm-opencl-icd/default.nix
deleted file mode 100644
index 4602d3646e47..000000000000
--- a/pkgs/development/libraries/rocm-opencl-icd/default.nix
+++ /dev/null
@@ -1,26 +0,0 @@
-{ lib
-, stdenv
-, callPackage
-, rocm-opencl-runtime
-}:
-
-stdenv.mkDerivation rec {
-  pname = "rocm-opencl-icd";
-  version = rocm-opencl-runtime.version;
-
-  dontUnpack = true;
-
-  installPhase = ''
-    mkdir -p $out/etc/OpenCL/vendors
-    echo "${rocm-opencl-runtime}/lib/libamdocl64.so" > $out/etc/OpenCL/vendors/amdocl64.icd
-  '';
-
-  passthru.impureTests = { rocm-opencl = callPackage ./test.nix { }; };
-
-  meta = with lib; {
-    description = "OpenCL ICD definition for AMD GPUs using the ROCm stack";
-    license = licenses.mit;
-    maintainers = with maintainers; [ lovesegfault ] ++ teams.rocm.members;
-    platforms = platforms.linux;
-  };
-}
diff --git a/pkgs/development/libraries/rocm-opencl-icd/test.nix b/pkgs/development/libraries/rocm-opencl-icd/test.nix
deleted file mode 100644
index 398a4818e7c3..000000000000
--- a/pkgs/development/libraries/rocm-opencl-icd/test.nix
+++ /dev/null
@@ -1,19 +0,0 @@
-{ lib, makeImpureTest, clinfo, rocm-opencl-icd, rocm-smi }:
-makeImpureTest {
-  name = "rocm-opencl";
-  testedPackage = "rocm-opencl-icd";
-
-  nativeBuildInputs = [ clinfo rocm-smi ];
-
-  OCL_ICD_VENDORS = "${rocm-opencl-icd}/etc/OpenCL/vendors/";
-
-  testScript = ''
-    # Test fails if the number of platforms is 0
-    clinfo | grep -E 'Number of platforms * [1-9]'
-    rocm-smi | grep -A1 GPU
-  '';
-
-  meta = with lib; {
-    maintainers = teams.rocm.members;
-  };
-}
diff --git a/pkgs/development/libraries/rocm-opencl-runtime/default.nix b/pkgs/development/libraries/rocm-opencl-runtime/default.nix
deleted file mode 100644
index ebdb4e3177d7..000000000000
--- a/pkgs/development/libraries/rocm-opencl-runtime/default.nix
+++ /dev/null
@@ -1,69 +0,0 @@
-{ lib
-, stdenv
-, fetchFromGitHub
-, rocmUpdateScript
-, addOpenGLRunpath
-, cmake
-, rocm-comgr
-, rocm-runtime
-, rocclr
-, glew
-, libX11
-, numactl
-}:
-
-stdenv.mkDerivation (finalAttrs: {
-  pname = "rocm-opencl-runtime";
-  version = "5.4.4";
-
-  src = fetchFromGitHub {
-    owner = "RadeonOpenCompute";
-    repo = "ROCm-OpenCL-Runtime";
-    rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-E1+Y/fgp5b+7H1LN+O1fwVi0/XRCgvsiSxTY3u/q+8I=";
-  };
-
-  nativeBuildInputs = [ cmake ];
-
-  buildInputs = [
-    rocm-comgr
-    rocm-runtime
-    glew
-    libX11
-    numactl
-  ];
-
-  cmakeFlags = [
-    "-DAMD_OPENCL_PATH=${finalAttrs.src}"
-    "-DROCCLR_PATH=${rocclr}"
-  ];
-
-  dontStrip = true;
-
-  # Remove clinfo, which is already provided through the
-  # `clinfo` package.
-  postInstall = ''
-    rm -rf $out/bin
-  '';
-
-  # Fix the ICD installation path for NixOS
-  postPatch = ''
-    substituteInPlace khronos/icd/loader/linux/icd_linux.c \
-      --replace 'ICD_VENDOR_PATH' '"${addOpenGLRunpath.driverLink}/etc/OpenCL/vendors/"'
-  '';
-
-  passthru.updateScript = rocmUpdateScript {
-    name = finalAttrs.pname;
-    owner = finalAttrs.src.owner;
-    repo = finalAttrs.src.repo;
-  };
-
-  meta = with lib; {
-    description = "OpenCL runtime for AMD GPUs, part of the ROCm stack";
-    homepage = "https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime";
-    license = with licenses; [ asl20 mit ];
-    maintainers = with maintainers; [ acowley lovesegfault ] ++ teams.rocm.members;
-    platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
-  };
-})
diff --git a/pkgs/development/libraries/rocprofiler/0000-dont-require-hsa_amd_aqlprofile.patch b/pkgs/development/libraries/rocprofiler/0000-dont-require-hsa_amd_aqlprofile.patch
deleted file mode 100644
index b70163b08e48..000000000000
--- a/pkgs/development/libraries/rocprofiler/0000-dont-require-hsa_amd_aqlprofile.patch
+++ /dev/null
@@ -1,20 +0,0 @@
-diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp
-index 643ff16..c08d98f 100644
---- a/src/util/hsa_rsrc_factory.cpp
-+++ b/src/util/hsa_rsrc_factory.cpp
-@@ -127,15 +127,6 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize
-   if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR);
-   if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR);
-
--  // Get AqlProfile API table
--  aqlprofile_api_ = {0};
--#ifdef ROCP_LD_AQLPROFILE
--  status = LoadAqlProfileLib(&aqlprofile_api_);
--#else
--  status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_);
--#endif
--  CHECK_STATUS("aqlprofile API table load failed", status);
--
-   // Get Loader API table
-   loader_api_ = {0};
-   status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_);
diff --git a/pkgs/development/libraries/ucx/default.nix b/pkgs/development/libraries/ucx/default.nix
index a6dfb9f85c80..c956a3333a03 100644
--- a/pkgs/development/libraries/ucx/default.nix
+++ b/pkgs/development/libraries/ucx/default.nix
@@ -4,7 +4,7 @@
 , enableCuda ? config.cudaSupport
 , cudatoolkit
 , enableRocm ? false
-, rocm-core, rocm-runtime, rocm-device-libs, hip
+, rocmPackages
 }:
 
 let
@@ -13,9 +13,12 @@ let
     inherit (cudatoolkit) name meta;
     paths = [ cudatoolkit cudatoolkit.lib ];
   };
+
+  rocmList = with rocmPackages; [ rocm-core rocm-runtime rocm-device-libs clr ];
+
   rocm = symlinkJoin {
     name = "rocm";
-    paths = [ rocm-core rocm-runtime rocm-device-libs hip ];
+    paths = rocmList;
   };
 
 in
@@ -40,7 +43,7 @@ stdenv.mkDerivation rec {
     rdma-core
     zlib
   ] ++ lib.optional enableCuda cudatoolkit
-  ++ lib.optionals enableRocm [ rocm-core rocm-runtime rocm-device-libs hip ];
+  ++ lib.optionals enableRocm rocmList;
 
   configureFlags = [
     "--with-rdmacm=${lib.getDev rdma-core}"
diff --git a/pkgs/development/libraries/clang-ocl/default.nix b/pkgs/development/rocm-modules/5/clang-ocl/default.nix
similarity index 100%
rename from pkgs/development/libraries/clang-ocl/default.nix
rename to pkgs/development/rocm-modules/5/clang-ocl/default.nix
diff --git a/pkgs/development/rocm-modules/5/clr/default.nix b/pkgs/development/rocm-modules/5/clr/default.nix
new file mode 100644
index 000000000000..d3f811dcc422
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/clr/default.nix
@@ -0,0 +1,147 @@
+{ lib
+, stdenv
+, fetchFromGitHub
+, rocmUpdateScript
+, makeWrapper
+, cmake
+, perl
+, clang
+, hip-common
+, hipcc
+, rocm-device-libs
+, rocm-comgr
+, rocm-runtime
+, roctracer
+, rocminfo
+, numactl
+, libGL
+, libxml2
+, libX11
+, python3Packages
+}:
+
+let
+  wrapperArgs = [
+    "--prefix PATH : $out/bin"
+    "--prefix LD_LIBRARY_PATH : ${rocm-runtime}"
+    "--set HIP_PLATFORM amd"
+    "--set HIP_PATH $out"
+    "--set HIP_CLANG_PATH ${clang}/bin"
+    "--set DEVICE_LIB_PATH ${rocm-device-libs}/amdgcn/bitcode"
+    "--set HSA_PATH ${rocm-runtime}"
+    "--set ROCM_PATH $out"
+  ];
+in stdenv.mkDerivation (finalAttrs: {
+  pname = "clr";
+  version = "5.7.0";
+
+  src = fetchFromGitHub {
+    owner = "ROCm-Developer-Tools";
+    repo = "clr";
+    rev = "rocm-${finalAttrs.version}";
+    hash = "sha256-C+rFW/7kf35rz0sQTI2+iY5RhZZQY07fc5a+e6cB5OQ=";
+  };
+
+  nativeBuildInputs = [
+    makeWrapper
+    cmake
+    perl
+    python3Packages.python
+    python3Packages.cppheaderparser
+  ];
+
+  buildInputs = [
+    numactl
+    libGL
+    libxml2
+    libX11
+  ];
+
+  propagatedBuildInputs = [
+    rocm-device-libs
+    rocm-comgr
+    rocm-runtime
+    rocminfo
+  ];
+
+  cmakeFlags = [
+    "-DCMAKE_POLICY_DEFAULT_CMP0072=NEW" # Prefer newer OpenGL libraries
+    "-DCLR_BUILD_HIP=ON"
+    "-DCLR_BUILD_OCL=ON"
+    "-DHIP_COMMON_DIR=${hip-common}"
+    "-DHIPCC_BIN_DIR=${hipcc}/bin"
+    "-DHIP_PLATFORM=amd"
+    "-DPROF_API_HEADER_PATH=${roctracer.src}/inc/ext"
+    "-DROCM_PATH=${rocminfo}"
+
+    # Temporarily set variables to work around upstream CMakeLists issue
+    # Can be removed once https://github.com/ROCm-Developer-Tools/hipamd/issues/55 is fixed
+    "-DCMAKE_INSTALL_BINDIR=bin"
+    "-DCMAKE_INSTALL_INCLUDEDIR=include"
+    "-DCMAKE_INSTALL_LIBDIR=lib"
+  ];
+
+  postPatch = ''
+    patchShebangs hipamd/src
+
+    # We're not on Windows so these are never installed to hipcc...
+    substituteInPlace hipamd/CMakeLists.txt \
+      --replace "install(PROGRAMS \''${HIPCC_BIN_DIR}/hipcc.bat DESTINATION bin)" "" \
+      --replace "install(PROGRAMS \''${HIPCC_BIN_DIR}/hipconfig.bat DESTINATION bin)" ""
+
+    substituteInPlace hipamd/src/hip_embed_pch.sh \
+      --replace "\''$LLVM_DIR/bin/clang" "${clang}/bin/clang"
+  '';
+
+  postInstall = ''
+    patchShebangs $out/bin
+
+    # hipcc.bin and hipconfig.bin is mysteriously never installed
+    cp -a ${hipcc}/bin/{hipcc.bin,hipconfig.bin} $out/bin
+
+    wrapProgram $out/bin/hipcc.bin ${lib.concatStringsSep " " wrapperArgs}
+    wrapProgram $out/bin/hipconfig.bin ${lib.concatStringsSep " " wrapperArgs}
+    wrapProgram $out/bin/hipcc.pl ${lib.concatStringsSep " " wrapperArgs}
+    wrapProgram $out/bin/hipconfig.pl ${lib.concatStringsSep " " wrapperArgs}
+
+    # Just link rocminfo, it's easier
+    ln -s ${rocminfo}/bin/* $out/bin
+  '';
+
+  passthru = {
+    # All known and valid general GPU targets
+    # We cannot use this for each ROCm library, as each defines their own supported targets
+    # See: https://github.com/RadeonOpenCompute/ROCm/blob/77cbac4abab13046ee93d8b5bf410684caf91145/README.md#library-target-matrix
+    gpuTargets = lib.forEach [
+      "803"
+      "900"
+      "906"
+      "908"
+      "90a"
+      "940"
+      "941"
+      "942"
+      "1010"
+      "1012"
+      "1030"
+      "1100"
+      "1101"
+      "1102"
+    ] (target: "gfx${target}");
+
+    updateScript = rocmUpdateScript {
+      name = finalAttrs.pname;
+      owner = finalAttrs.src.owner;
+      repo = finalAttrs.src.repo;
+    };
+  };
+
+  meta = with lib; {
+    description = "AMD Common Language Runtime for hipamd, opencl, and rocclr";
+    homepage = "https://github.com/ROCm-Developer-Tools/clr";
+    license = with licenses; [ mit ];
+    maintainers = with maintainers; [ lovesegfault ] ++ teams.rocm.members;
+    platforms = platforms.linux;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
+  };
+})
diff --git a/pkgs/development/libraries/composable_kernel/default.nix b/pkgs/development/rocm-modules/5/composable_kernel/default.nix
similarity index 100%
rename from pkgs/development/libraries/composable_kernel/default.nix
rename to pkgs/development/rocm-modules/5/composable_kernel/default.nix
diff --git a/pkgs/development/rocm-modules/5/default.nix b/pkgs/development/rocm-modules/5/default.nix
index 6509f8850858..8bc496f452f1 100644
--- a/pkgs/development/rocm-modules/5/default.nix
+++ b/pkgs/development/rocm-modules/5/default.nix
@@ -1,9 +1,244 @@
 { callPackage
 , recurseIntoAttrs
+, cudaPackages
+, python3Packages
+, elfutils
+, boost179
 }:
 
 let
   rocmUpdateScript = callPackage ./update.nix { };
-in {
+in rec {
+  ## RadeonOpenCompute ##
   llvm = recurseIntoAttrs (callPackage ./llvm/default.nix { inherit rocmUpdateScript; });
+
+  rocm-core = callPackage ./rocm-core {
+    inherit rocmUpdateScript;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocm-cmake = callPackage ./rocm-cmake {
+    inherit rocmUpdateScript;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocm-thunk = callPackage ./rocm-thunk {
+    inherit rocmUpdateScript;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocm-smi = python3Packages.callPackage ./rocm-smi {
+    inherit rocmUpdateScript;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  # Eventually will be in the LLVM repo
+  rocm-device-libs = callPackage ./rocm-device-libs {
+    inherit rocmUpdateScript rocm-cmake;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocm-runtime = callPackage ./rocm-runtime {
+    inherit rocmUpdateScript rocm-device-libs rocm-thunk;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  # Eventually will be in the LLVM repo
+  rocm-comgr = callPackage ./rocm-comgr {
+    inherit rocmUpdateScript rocm-cmake rocm-device-libs;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocminfo = callPackage ./rocminfo {
+    inherit rocmUpdateScript rocm-cmake rocm-runtime;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  clang-ocl = callPackage ./clang-ocl {
+    inherit rocmUpdateScript rocm-cmake rocm-device-libs;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  # Broken, too many errors
+  rdc = callPackage ./rdc {
+    inherit rocmUpdateScript rocm-smi rocm-runtime;
+    # stdenv = llvm.rocmClangStdenv;
+  };
+
+  ## ROCm-Developer-Tools ##
+  hip-common = callPackage ./hip-common {
+    inherit rocmUpdateScript;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  # Eventually will be in the LLVM repo
+  hipcc = callPackage ./hipcc {
+    inherit rocmUpdateScript;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  clr = callPackage ./clr {
+    inherit rocmUpdateScript hip-common hipcc rocm-device-libs rocm-comgr rocm-runtime roctracer rocminfo;
+    inherit (llvm) clang;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  hipify = callPackage ./hipify {
+    inherit rocmUpdateScript;
+    inherit (llvm) clang;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  # Needs GCC
+  rocprofiler = callPackage ./rocprofiler {
+    inherit (llvm) clang;
+    inherit rocmUpdateScript clr rocm-thunk roctracer rocm-smi hsa-amd-aqlprofile-bin;
+  };
+
+  # Needs GCC
+  roctracer = callPackage ./roctracer {
+    inherit rocmUpdateScript rocm-device-libs rocm-runtime rocprofiler clr;
+    inherit (llvm) clang;
+  };
+
+  # Needs GCC
+  rocgdb = callPackage ./rocgdb {
+    inherit rocmUpdateScript;
+    elfutils = elfutils.override { enableDebuginfod = true; };
+  };
+
+  rocdbgapi = callPackage ./rocdbgapi {
+    inherit rocmUpdateScript rocm-cmake rocm-comgr rocm-runtime;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocr-debug-agent = callPackage ./rocr-debug-agent {
+    inherit rocmUpdateScript clr rocdbgapi;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  composable_kernel = callPackage ./composable_kernel {
+    inherit (llvm) openmp clang-tools-extra;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  hipcub = callPackage ./hipcub {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  hipsparse = callPackage ./hipsparse {
+    inherit (llvm) openmp;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  hipfort = callPackage ./hipfort {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  hipfft = callPackage ./hipfft {
+    inherit (llvm) openmp;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  hipsolver = callPackage ./hipsolver {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  hipblas = callPackage ./hipblas {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  migraphx = callPackage ./migraphx {
+    inherit (llvm) clang-tools-extra openmp;
+    stdenv = llvm.rocmClangStdenv;
+    rocmlir = rocmlir-rock;
+  };
+
+  rccl = callPackage ./rccl {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocalution = callPackage ./rocalution {
+    inherit (llvm) openmp;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocsolver = callPackage ./rocsolver {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocmlir = callPackage ./rocmlir {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocmlir-rock = rocmlir.override {
+    buildRockCompiler = true;
+  };
+
+  rocprim = callPackage ./rocprim {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocsparse = callPackage ./rocsparse {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocfft = callPackage ./rocfft {
+    inherit (llvm) openmp;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocrand = callPackage ./rocrand {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  tensile = python3Packages.callPackage ./tensile {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocwmma = callPackage ./rocwmma {
+    inherit (llvm) openmp;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocblas = callPackage ./rocblas {
+    inherit (llvm) openmp;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  miopengemm = callPackage ./miopengemm {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocthrust = callPackage ./rocthrust {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  miopen = callPackage ./miopen {
+    inherit (llvm) llvm clang-tools-extra;
+    stdenv = llvm.rocmClangStdenv;
+    rocmlir = rocmlir-rock;
+    boost = boost179.override { enableStatic = true; };
+  };
+
+  miopen-hip = miopen.override {
+    useOpenCL = false;
+  };
+
+  miopen-opencl = miopen.override {
+    useOpenCL = true;
+  };
 }
diff --git a/pkgs/development/compilers/hip-common/default.nix b/pkgs/development/rocm-modules/5/hip-common/default.nix
similarity index 79%
rename from pkgs/development/compilers/hip-common/default.nix
rename to pkgs/development/rocm-modules/5/hip-common/default.nix
index 754fea89ac5e..9f5f37511ef0 100644
--- a/pkgs/development/compilers/hip-common/default.nix
+++ b/pkgs/development/rocm-modules/5/hip-common/default.nix
@@ -2,11 +2,6 @@
 , stdenv
 , fetchFromGitHub
 , rocmUpdateScript
-, substituteAll
-, llvm
-, rocm-runtime
-, rocminfo
-, lsb-release
 }:
 
 stdenv.mkDerivation (finalAttrs: {
@@ -20,16 +15,6 @@ stdenv.mkDerivation (finalAttrs: {
     hash = "sha256-1Abit9qZCwrCVcnaFT4uMygFB9G6ovRasLmTsOsJ/Fw=";
   };
 
-  patches = [
-    (substituteAll {
-      src = ./0000-fixup-paths.patch;
-      inherit llvm rocminfo;
-      clang = stdenv.cc;
-      rocm_runtime = rocm-runtime;
-      lsb_release = lsb-release;
-    })
-  ];
-
   dontConfigure = true;
   dontBuild = true;
 
diff --git a/pkgs/development/libraries/hipblas/default.nix b/pkgs/development/rocm-modules/5/hipblas/default.nix
similarity index 97%
rename from pkgs/development/libraries/hipblas/default.nix
rename to pkgs/development/rocm-modules/5/hipblas/default.nix
index 845c5b9d0d7d..cb60e5de6633 100644
--- a/pkgs/development/libraries/hipblas/default.nix
+++ b/pkgs/development/rocm-modules/5/hipblas/default.nix
@@ -95,6 +95,6 @@ stdenv.mkDerivation (finalAttrs: {
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
     # Fixed in develop branch by using C++17 and related refactor
-    broken = versions.minor finalAttrs.version != versions.minor hip.version || buildTests || buildBenchmarks || buildSamples;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version || buildTests || buildBenchmarks || buildSamples;
   };
 })
diff --git a/pkgs/development/compilers/hipcc/default.nix b/pkgs/development/rocm-modules/5/hipcc/default.nix
similarity index 62%
rename from pkgs/development/compilers/hipcc/default.nix
rename to pkgs/development/rocm-modules/5/hipcc/default.nix
index b758d0e1ed96..e6610e8909f7 100644
--- a/pkgs/development/compilers/hipcc/default.nix
+++ b/pkgs/development/rocm-modules/5/hipcc/default.nix
@@ -2,11 +2,7 @@
 , stdenv
 , fetchFromGitHub
 , rocmUpdateScript
-, substituteAll
 , cmake
-, llvm
-, rocm-runtime
-, rocminfo
 , lsb-release
 }:
 
@@ -21,28 +17,16 @@ stdenv.mkDerivation (finalAttrs: {
     hash = "sha256-lJX6nF1V4YmK5ai7jivXlRnG3doIOf6X9CWLHVdRuVg=";
   };
 
-  patches = [
-    (substituteAll {
-      src = ./0000-fixup-paths.patch;
-      inherit llvm rocminfo;
-      version_major = lib.versions.major finalAttrs.version;
-      version_minor = lib.versions.minor finalAttrs.version;
-      version_patch = lib.versions.patch finalAttrs.version;
-      clang = stdenv.cc;
-      rocm_runtime = rocm-runtime;
-      lsb_release = lsb-release;
-    })
-  ];
-
   nativeBuildInputs = [ cmake ];
 
-  installPhase = ''
-    runHook preInstall
+  postPatch = ''
+    substituteInPlace src/hipBin_amd.h \
+      --replace "/usr/bin/lsb_release" "${lsb-release}/bin/lsb_release"
+  '';
 
-    mkdir -p $out/bin
-    mv *.bin $out/bin
-
-    runHook postInstall
+  postInstall = ''
+    rm -r $out/hip/bin
+    ln -s $out/bin $out/hip/bin
   '';
 
   passthru.updateScript = rocmUpdateScript {
diff --git a/pkgs/development/libraries/hipcub/default.nix b/pkgs/development/rocm-modules/5/hipcub/default.nix
similarity index 99%
rename from pkgs/development/libraries/hipcub/default.nix
rename to pkgs/development/rocm-modules/5/hipcub/default.nix
index fff34e1a0ec7..b3a23241366f 100644
--- a/pkgs/development/libraries/hipcub/default.nix
+++ b/pkgs/development/rocm-modules/5/hipcub/default.nix
@@ -82,6 +82,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ bsd3 ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/hipfft/default.nix b/pkgs/development/rocm-modules/5/hipfft/default.nix
similarity index 99%
rename from pkgs/development/libraries/hipfft/default.nix
rename to pkgs/development/rocm-modules/5/hipfft/default.nix
index c208296c687b..1e959f0ad56a 100644
--- a/pkgs/development/libraries/hipfft/default.nix
+++ b/pkgs/development/rocm-modules/5/hipfft/default.nix
@@ -102,6 +102,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/hipfort/default.nix b/pkgs/development/rocm-modules/5/hipfort/default.nix
similarity index 100%
rename from pkgs/development/libraries/hipfort/default.nix
rename to pkgs/development/rocm-modules/5/hipfort/default.nix
diff --git a/pkgs/development/compilers/hipify/default.nix b/pkgs/development/rocm-modules/5/hipify/default.nix
similarity index 83%
rename from pkgs/development/compilers/hipify/default.nix
rename to pkgs/development/rocm-modules/5/hipify/default.nix
index d7b243b9da04..893056496c9c 100644
--- a/pkgs/development/compilers/hipify/default.nix
+++ b/pkgs/development/rocm-modules/5/hipify/default.nix
@@ -3,6 +3,7 @@
 , fetchFromGitHub
 , rocmUpdateScript
 , cmake
+, clang
 , libxml2
 }:
 
@@ -22,7 +23,7 @@ stdenv.mkDerivation (finalAttrs: {
 
   postPatch = ''
     substituteInPlace CMakeLists.txt \
-      --replace "\''${LLVM_TOOLS_BINARY_DIR}/clang" "${stdenv.cc}/bin/clang"
+      --replace "\''${LLVM_TOOLS_BINARY_DIR}/clang" "${clang}/bin/clang"
   '';
 
   passthru.updateScript = rocmUpdateScript {
@@ -31,11 +32,11 @@ stdenv.mkDerivation (finalAttrs: {
     repo = finalAttrs.src.repo;
   };
 
-  # Fixup weird install paths
+  # Fixup bad symlinks
   postInstall = ''
-    mkdir -p $out/bin
-    mv $out/{*.sh,hipify-*} $out/bin
-    cp -afs $out/bin $out/hip
+    rm -r $out/hip/bin
+    ln -s $out/bin $out/hip/bin
+    patchShebangs $out/bin
   '';
 
   meta = with lib; {
diff --git a/pkgs/development/libraries/hipsolver/default.nix b/pkgs/development/rocm-modules/5/hipsolver/default.nix
similarity index 99%
rename from pkgs/development/libraries/hipsolver/default.nix
rename to pkgs/development/rocm-modules/5/hipsolver/default.nix
index cd689856d418..238564d631bc 100644
--- a/pkgs/development/libraries/hipsolver/default.nix
+++ b/pkgs/development/rocm-modules/5/hipsolver/default.nix
@@ -95,6 +95,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/hipsparse/default.nix b/pkgs/development/rocm-modules/5/hipsparse/default.nix
similarity index 99%
rename from pkgs/development/libraries/hipsparse/default.nix
rename to pkgs/development/rocm-modules/5/hipsparse/default.nix
index 45a571735b70..6e6197209e4b 100644
--- a/pkgs/development/libraries/hipsparse/default.nix
+++ b/pkgs/development/rocm-modules/5/hipsparse/default.nix
@@ -131,6 +131,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/migraphx/default.nix b/pkgs/development/rocm-modules/5/migraphx/default.nix
similarity index 99%
rename from pkgs/development/libraries/migraphx/default.nix
rename to pkgs/development/rocm-modules/5/migraphx/default.nix
index 2a842a3c24dd..fdc97f45da9d 100644
--- a/pkgs/development/libraries/migraphx/default.nix
+++ b/pkgs/development/rocm-modules/5/migraphx/default.nix
@@ -155,6 +155,6 @@ in stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/miopen/default.nix b/pkgs/development/rocm-modules/5/miopen/default.nix
similarity index 99%
rename from pkgs/development/libraries/miopen/default.nix
rename to pkgs/development/rocm-modules/5/miopen/default.nix
index 5345c2216e66..dd7661a09756 100644
--- a/pkgs/development/libraries/miopen/default.nix
+++ b/pkgs/development/rocm-modules/5/miopen/default.nix
@@ -185,6 +185,6 @@ in stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/miopen/deps.nix b/pkgs/development/rocm-modules/5/miopen/deps.nix
similarity index 100%
rename from pkgs/development/libraries/miopen/deps.nix
rename to pkgs/development/rocm-modules/5/miopen/deps.nix
diff --git a/pkgs/development/libraries/miopengemm/default.nix b/pkgs/development/rocm-modules/5/miopengemm/default.nix
similarity index 100%
rename from pkgs/development/libraries/miopengemm/default.nix
rename to pkgs/development/rocm-modules/5/miopengemm/default.nix
diff --git a/pkgs/development/libraries/rccl/default.nix b/pkgs/development/rocm-modules/5/rccl/default.nix
similarity index 98%
rename from pkgs/development/libraries/rccl/default.nix
rename to pkgs/development/rocm-modules/5/rccl/default.nix
index acd0030cabd9..68dafc29c164 100644
--- a/pkgs/development/libraries/rccl/default.nix
+++ b/pkgs/development/rocm-modules/5/rccl/default.nix
@@ -78,6 +78,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ bsd2 bsd3 ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/tools/misc/rdc/default.nix b/pkgs/development/rocm-modules/5/rdc/default.nix
similarity index 94%
rename from pkgs/development/tools/misc/rdc/default.nix
rename to pkgs/development/rocm-modules/5/rdc/default.nix
index d2a7f46dc849..134b946c5f7a 100644
--- a/pkgs/development/tools/misc/rdc/default.nix
+++ b/pkgs/development/rocm-modules/5/rdc/default.nix
@@ -120,6 +120,7 @@ in stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor rocm-smi.version;
+    # broken = versions.minor finalAttrs.version != versions.minor rocm-smi.version;
+    broken = true; # Too many errors, unsure how to fix
   };
 })
diff --git a/pkgs/development/libraries/rocalution/default.nix b/pkgs/development/rocm-modules/5/rocalution/default.nix
similarity index 99%
rename from pkgs/development/libraries/rocalution/default.nix
rename to pkgs/development/rocm-modules/5/rocalution/default.nix
index f67384a95f08..650e9dc7a1ca 100644
--- a/pkgs/development/libraries/rocalution/default.nix
+++ b/pkgs/development/rocm-modules/5/rocalution/default.nix
@@ -110,6 +110,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/rocblas/default.nix b/pkgs/development/rocm-modules/5/rocblas/default.nix
similarity index 99%
rename from pkgs/development/libraries/rocblas/default.nix
rename to pkgs/development/rocm-modules/5/rocblas/default.nix
index 78d0e7df8b24..59d23ad121da 100644
--- a/pkgs/development/libraries/rocblas/default.nix
+++ b/pkgs/development/rocm-modules/5/rocblas/default.nix
@@ -134,6 +134,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/rocdbgapi/default.nix b/pkgs/development/rocm-modules/5/rocdbgapi/default.nix
similarity index 87%
rename from pkgs/development/libraries/rocdbgapi/default.nix
rename to pkgs/development/rocm-modules/5/rocdbgapi/default.nix
index a1cfacf33c27..41c1178f1089 100644
--- a/pkgs/development/libraries/rocdbgapi/default.nix
+++ b/pkgs/development/rocm-modules/5/rocdbgapi/default.nix
@@ -7,6 +7,7 @@
 , git
 , rocm-comgr
 , rocm-runtime
+, hwdata
 , texlive
 , doxygen
 , graphviz
@@ -65,6 +66,16 @@ in stdenv.mkDerivation (finalAttrs: {
   buildInputs = [
     rocm-comgr
     rocm-runtime
+    hwdata
+  ];
+
+  cmakeFlags = [
+    "-DPCI_IDS_PATH=${hwdata}/share/hwdata"
+    # Manually define CMAKE_INSTALL_<DIR>
+    # See: https://github.com/NixOS/nixpkgs/pull/197838
+    "-DCMAKE_INSTALL_BINDIR=bin"
+    "-DCMAKE_INSTALL_LIBDIR=lib"
+    "-DCMAKE_INSTALL_INCLUDEDIR=include"
   ];
 
   # Unfortunately, it seems like we have to call make on this manually
diff --git a/pkgs/development/libraries/rocfft/default.nix b/pkgs/development/rocm-modules/5/rocfft/default.nix
similarity index 99%
rename from pkgs/development/libraries/rocfft/default.nix
rename to pkgs/development/rocm-modules/5/rocfft/default.nix
index 8eed31b8b233..ee1078eabb28 100644
--- a/pkgs/development/libraries/rocfft/default.nix
+++ b/pkgs/development/rocm-modules/5/rocfft/default.nix
@@ -238,6 +238,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = with maintainers; [ kira-bruneau ] ++ teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/rocfft/device-install.patch b/pkgs/development/rocm-modules/5/rocfft/device-install.patch
similarity index 100%
rename from pkgs/development/libraries/rocfft/device-install.patch
rename to pkgs/development/rocm-modules/5/rocfft/device-install.patch
diff --git a/pkgs/development/libraries/rocfft/split-kernel-compilation.patch b/pkgs/development/rocm-modules/5/rocfft/split-kernel-compilation.patch
similarity index 100%
rename from pkgs/development/libraries/rocfft/split-kernel-compilation.patch
rename to pkgs/development/rocm-modules/5/rocfft/split-kernel-compilation.patch
diff --git a/pkgs/development/tools/misc/rocgdb/default.nix b/pkgs/development/rocm-modules/5/rocgdb/default.nix
similarity index 100%
rename from pkgs/development/tools/misc/rocgdb/default.nix
rename to pkgs/development/rocm-modules/5/rocgdb/default.nix
diff --git a/pkgs/development/tools/build-managers/rocm-cmake/default.nix b/pkgs/development/rocm-modules/5/rocm-cmake/default.nix
similarity index 91%
rename from pkgs/development/tools/build-managers/rocm-cmake/default.nix
rename to pkgs/development/rocm-modules/5/rocm-cmake/default.nix
index 9e9cf3caf12e..04ae947d3a4a 100644
--- a/pkgs/development/tools/build-managers/rocm-cmake/default.nix
+++ b/pkgs/development/rocm-modules/5/rocm-cmake/default.nix
@@ -22,6 +22,8 @@ stdenv.mkDerivation (finalAttrs: {
     name = finalAttrs.pname;
     owner = finalAttrs.src.owner;
     repo = finalAttrs.src.repo;
+    page = "releases?per_page=2";
+    filter = ".[1].tag_name | split(\"-\") | .[1]";
   };
 
   meta = with lib; {
diff --git a/pkgs/development/libraries/rocm-comgr/default.nix b/pkgs/development/rocm-modules/5/rocm-comgr/default.nix
similarity index 97%
rename from pkgs/development/libraries/rocm-comgr/default.nix
rename to pkgs/development/rocm-modules/5/rocm-comgr/default.nix
index 4d84af3afa54..c3c4c5fab3cd 100644
--- a/pkgs/development/libraries/rocm-comgr/default.nix
+++ b/pkgs/development/rocm-modules/5/rocm-comgr/default.nix
@@ -24,7 +24,6 @@ in stdenv.mkDerivation (finalAttrs: {
     hash = "sha256-QB3G0V92UTW67hD6+zSuExN1+eMT820iYSlMyZeWSFw=";
   };
 
-  patches = [ ./cmake.patch ];
   sourceRoot = "${finalAttrs.src.name}/lib/comgr";
 
   nativeBuildInputs = [
diff --git a/pkgs/development/libraries/rocm-core/default.nix b/pkgs/development/rocm-modules/5/rocm-core/default.nix
similarity index 100%
rename from pkgs/development/libraries/rocm-core/default.nix
rename to pkgs/development/rocm-modules/5/rocm-core/default.nix
diff --git a/pkgs/development/libraries/rocm-device-libs/cmake.patch b/pkgs/development/rocm-modules/5/rocm-device-libs/cmake.patch
similarity index 100%
rename from pkgs/development/libraries/rocm-device-libs/cmake.patch
rename to pkgs/development/rocm-modules/5/rocm-device-libs/cmake.patch
diff --git a/pkgs/development/libraries/rocm-device-libs/default.nix b/pkgs/development/rocm-modules/5/rocm-device-libs/default.nix
similarity index 100%
rename from pkgs/development/libraries/rocm-device-libs/default.nix
rename to pkgs/development/rocm-modules/5/rocm-device-libs/default.nix
diff --git a/pkgs/development/libraries/rocm-runtime/default.nix b/pkgs/development/rocm-modules/5/rocm-runtime/default.nix
similarity index 87%
rename from pkgs/development/libraries/rocm-runtime/default.nix
rename to pkgs/development/rocm-modules/5/rocm-runtime/default.nix
index dfb10c363153..fd9182c8254d 100644
--- a/pkgs/development/libraries/rocm-runtime/default.nix
+++ b/pkgs/development/rocm-modules/5/rocm-runtime/default.nix
@@ -50,8 +50,10 @@ stdenv.mkDerivation (finalAttrs: {
       --replace 'hsa/include/hsa' 'include/hsa'
 
     # We compile clang before rocm-device-libs, so patch it in afterwards
+    # Replace object version: https://github.com/RadeonOpenCompute/ROCR-Runtime/issues/166 (TODO: Remove on LLVM update?)
     substituteInPlace image/blit_src/CMakeLists.txt \
-      --replace '-cl-denorms-are-zero' '-cl-denorms-are-zero --rocm-device-lib-path=${rocm-device-libs}/amdgcn/bitcode'
+      --replace '-cl-denorms-are-zero' '-cl-denorms-are-zero --rocm-device-lib-path=${rocm-device-libs}/amdgcn/bitcode' \
+      --replace '-mcode-object-version=4' '-mcode-object-version=5'
   '';
 
   fixupPhase = ''
diff --git a/pkgs/tools/system/rocm-smi/cmake.patch b/pkgs/development/rocm-modules/5/rocm-smi/cmake.patch
similarity index 100%
rename from pkgs/tools/system/rocm-smi/cmake.patch
rename to pkgs/development/rocm-modules/5/rocm-smi/cmake.patch
diff --git a/pkgs/tools/system/rocm-smi/default.nix b/pkgs/development/rocm-modules/5/rocm-smi/default.nix
similarity index 72%
rename from pkgs/tools/system/rocm-smi/default.nix
rename to pkgs/development/rocm-modules/5/rocm-smi/default.nix
index 2fa79828c63b..2e1692539e23 100644
--- a/pkgs/tools/system/rocm-smi/default.nix
+++ b/pkgs/development/rocm-modules/5/rocm-smi/default.nix
@@ -17,16 +17,24 @@ stdenv.mkDerivation (finalAttrs: {
     hash = "sha256-swCRO4PBMBJ6fO2bLq/xxFZIYw2IgiFB490wsU8Wm2o=";
   };
 
-  postPatch = ''
-    sed '1i#include <cstring>' -i src/rocm_smi{,_gpu_metrics}.cc # since gcc12 probably
-  '';
-
-  nativeBuildInputs = [ cmake wrapPython ];
-
   patches = [ ./cmake.patch ];
 
+  nativeBuildInputs = [
+    cmake
+    wrapPython
+  ];
+
+  cmakeFlags = [
+    # Manually define CMAKE_INSTALL_<DIR>
+    # See: https://github.com/NixOS/nixpkgs/pull/197838
+    "-DCMAKE_INSTALL_BINDIR=bin"
+    "-DCMAKE_INSTALL_LIBDIR=lib"
+    "-DCMAKE_INSTALL_INCLUDEDIR=include"
+  ];
+
   postInstall = ''
     wrapPythonProgramsIn $out
+    mv $out/libexec/rocm_smi/.rsmiBindings.py-wrapped $out/libexec/rocm_smi/rsmiBindings.py
   '';
 
   passthru.updateScript = rocmUpdateScript {
diff --git a/pkgs/development/libraries/rocm-thunk/default.nix b/pkgs/development/rocm-modules/5/rocm-thunk/default.nix
similarity index 92%
rename from pkgs/development/libraries/rocm-thunk/default.nix
rename to pkgs/development/rocm-modules/5/rocm-thunk/default.nix
index 8a4ad2a098c6..73368dbb0e7f 100644
--- a/pkgs/development/libraries/rocm-thunk/default.nix
+++ b/pkgs/development/rocm-modules/5/rocm-thunk/default.nix
@@ -4,11 +4,8 @@
 , rocmUpdateScript
 , pkg-config
 , cmake
-, rocm-cmake
 , libdrm
 , numactl
-, valgrind
-, gcc
 }:
 
 stdenv.mkDerivation (finalAttrs: {
@@ -25,14 +22,11 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     pkg-config
     cmake
-    rocm-cmake
   ];
 
   buildInputs = [
     libdrm
     numactl
-    valgrind
-    gcc.cc.libgcc or null # TODO: unhack this?
   ];
 
   cmakeFlags = [
diff --git a/pkgs/development/tools/rocminfo/default.nix b/pkgs/development/rocm-modules/5/rocminfo/default.nix
similarity index 100%
rename from pkgs/development/tools/rocminfo/default.nix
rename to pkgs/development/rocm-modules/5/rocminfo/default.nix
diff --git a/pkgs/development/libraries/rocmlir/default.nix b/pkgs/development/rocm-modules/5/rocmlir/default.nix
similarity index 100%
rename from pkgs/development/libraries/rocmlir/default.nix
rename to pkgs/development/rocm-modules/5/rocmlir/default.nix
diff --git a/pkgs/development/libraries/rocprim/default.nix b/pkgs/development/rocm-modules/5/rocprim/default.nix
similarity index 98%
rename from pkgs/development/libraries/rocprim/default.nix
rename to pkgs/development/rocm-modules/5/rocprim/default.nix
index b38684b24dd6..e8233547664f 100644
--- a/pkgs/development/libraries/rocprim/default.nix
+++ b/pkgs/development/rocm-modules/5/rocprim/default.nix
@@ -77,6 +77,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/rocprofiler/default.nix b/pkgs/development/rocm-modules/5/rocprofiler/default.nix
similarity index 52%
rename from pkgs/development/libraries/rocprofiler/default.nix
rename to pkgs/development/rocm-modules/5/rocprofiler/default.nix
index 97f269beb84e..ec24a3f41e59 100644
--- a/pkgs/development/libraries/rocprofiler/default.nix
+++ b/pkgs/development/rocm-modules/5/rocprofiler/default.nix
@@ -3,10 +3,32 @@
 , fetchFromGitHub
 , rocmUpdateScript
 , cmake
-, rocm-runtime
+, clang
+, clr
 , rocm-thunk
 , roctracer
+, rocm-smi
+, hsa-amd-aqlprofile-bin
 , numactl
+, libpciaccess
+, libxml2
+, elfutils
+, mpi
+, gtest
+, python3Packages
+, gpuTargets ? [
+  "gfx900"
+  "gfx906"
+  "gfx908"
+  "gfx90a"
+  "gfx940"
+  "gfx941"
+  "gfx942"
+  "gfx1030"
+  "gfx1100"
+  "gfx1101"
+  "gfx1102"
+]
 }:
 
 stdenv.mkDerivation (finalAttrs: {
@@ -20,17 +42,33 @@ stdenv.mkDerivation (finalAttrs: {
     hash = "sha256-ue/2uiLbhOv/5XY4cIJuZ8DUMRhniYgxolq9xMwO1FY=";
   };
 
-  patches = [ ./0000-dont-require-hsa_amd_aqlprofile.patch ];
-  nativeBuildInputs = [ cmake ];
+  nativeBuildInputs = [
+    cmake
+    clang
+    clr
+    python3Packages.lxml
+    python3Packages.cppheaderparser
+    python3Packages.pyyaml
+    python3Packages.barectf
+  ];
 
   buildInputs = [
     rocm-thunk
-    rocm-runtime
+    rocm-smi
+    hsa-amd-aqlprofile-bin
     numactl
+    libpciaccess
+    libxml2
+    elfutils
+    mpi
+    gtest
   ];
 
   cmakeFlags = [
+    "-DCMAKE_MODULE_PATH=${clr}/lib/cmake/hip"
     "-DPROF_API_HEADER_PATH=${roctracer.src}/inc/ext"
+    "-DHIP_ROOT_DIR=${clr}"
+    "-DGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
@@ -39,14 +77,18 @@ stdenv.mkDerivation (finalAttrs: {
   ];
 
   postPatch = ''
-    patchShebangs bin test
+    patchShebangs .
 
-    substituteInPlace cmake_modules/env.cmake \
-      --replace "FATAL_ERROR \"AQL_PROFILE" "WARNING \"AQL_PROFILE"
+    # Cannot find ROCm device library, pointless
+    substituteInPlace CMakeLists.txt \
+      --replace "add_subdirectory(tests-v2)" "" \
+      --replace "add_subdirectory(samples)" ""
   '';
 
-  postInstall = ''
-    patchelf --set-rpath $out/lib:${lib.makeLibraryPath finalAttrs.buildInputs} $out/lib/rocprofiler/librocprof-tool.so
+  postBuild = ''
+    # HSACO aren't being built for some reason
+    substituteInPlace test/cmake_install.cmake \
+      --replace "file(INSTALL DESTINATION \"\''${CMAKE_INSTALL_PREFIX}/share/rocprofiler/tests-v1\" TYPE FILE FILES \"" "message(\""
   '';
 
   passthru.updateScript = rocmUpdateScript {
@@ -61,6 +103,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ]; # mitx11
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
+    broken = versions.minor finalAttrs.version != versions.minor clr.version;
   };
 })
diff --git a/pkgs/development/libraries/rocr-debug-agent/default.nix b/pkgs/development/rocm-modules/5/rocr-debug-agent/default.nix
similarity index 87%
rename from pkgs/development/libraries/rocr-debug-agent/default.nix
rename to pkgs/development/rocm-modules/5/rocr-debug-agent/default.nix
index 08d45f304a4f..dfc8580b3e14 100644
--- a/pkgs/development/libraries/rocr-debug-agent/default.nix
+++ b/pkgs/development/rocm-modules/5/rocr-debug-agent/default.nix
@@ -3,10 +3,9 @@
 , fetchFromGitHub
 , rocmUpdateScript
 , cmake
-, hip
+, clr
 , git
 , rocdbgapi
-, rocm-runtime
 , elfutils
 }:
 
@@ -23,20 +22,19 @@ stdenv.mkDerivation (finalAttrs: {
 
   nativeBuildInputs = [
     cmake
-    hip
+    clr
     git
   ];
 
   buildInputs = [
     rocdbgapi
-    rocm-runtime
     elfutils
   ];
 
   cmakeFlags = [
-    "-DCMAKE_MODULE_PATH=${hip}/lib/cmake/hip"
-    "-DHIP_ROOT_DIR=${hip}"
-    "-DHIP_PATH=${hip}"
+    "-DCMAKE_MODULE_PATH=${clr}/lib/cmake/hip"
+    "-DHIP_ROOT_DIR=${clr}"
+    "-DHIP_PATH=${clr}"
   ];
 
   # Weird install target
@@ -56,6 +54,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ ncsa ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/rocrand/default.nix b/pkgs/development/rocm-modules/5/rocrand/default.nix
similarity index 99%
rename from pkgs/development/libraries/rocrand/default.nix
rename to pkgs/development/rocm-modules/5/rocrand/default.nix
index 8ea138d4c3cd..daa24b870ceb 100644
--- a/pkgs/development/libraries/rocrand/default.nix
+++ b/pkgs/development/rocm-modules/5/rocrand/default.nix
@@ -80,6 +80,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/rocsolver/default.nix b/pkgs/development/rocm-modules/5/rocsolver/default.nix
similarity index 99%
rename from pkgs/development/libraries/rocsolver/default.nix
rename to pkgs/development/rocm-modules/5/rocsolver/default.nix
index c78b4d97a0ae..3a0858af6335 100644
--- a/pkgs/development/libraries/rocsolver/default.nix
+++ b/pkgs/development/rocm-modules/5/rocsolver/default.nix
@@ -90,6 +90,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ bsd2 ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/rocsparse/default.nix b/pkgs/development/rocm-modules/5/rocsparse/default.nix
similarity index 99%
rename from pkgs/development/libraries/rocsparse/default.nix
rename to pkgs/development/rocm-modules/5/rocsparse/default.nix
index a93d7a77bf26..d97951530119 100644
--- a/pkgs/development/libraries/rocsparse/default.nix
+++ b/pkgs/development/rocm-modules/5/rocsparse/default.nix
@@ -141,6 +141,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/rocsparse/deps.nix b/pkgs/development/rocm-modules/5/rocsparse/deps.nix
similarity index 100%
rename from pkgs/development/libraries/rocsparse/deps.nix
rename to pkgs/development/rocm-modules/5/rocsparse/deps.nix
diff --git a/pkgs/development/libraries/rocthrust/default.nix b/pkgs/development/rocm-modules/5/rocthrust/default.nix
similarity index 99%
rename from pkgs/development/libraries/rocthrust/default.nix
rename to pkgs/development/rocm-modules/5/rocthrust/default.nix
index a4981d3fb270..e441709f89f7 100644
--- a/pkgs/development/libraries/rocthrust/default.nix
+++ b/pkgs/development/rocm-modules/5/rocthrust/default.nix
@@ -79,6 +79,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ asl20 ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/roctracer/default.nix b/pkgs/development/rocm-modules/5/roctracer/default.nix
similarity index 93%
rename from pkgs/development/libraries/roctracer/default.nix
rename to pkgs/development/rocm-modules/5/roctracer/default.nix
index 3aeb8e3ba198..92e557426b10 100644
--- a/pkgs/development/libraries/roctracer/default.nix
+++ b/pkgs/development/rocm-modules/5/roctracer/default.nix
@@ -4,7 +4,7 @@
 , rocmUpdateScript
 , cmake
 , clang
-, hip
+, clr
 , rocm-device-libs
 , rocprofiler
 , libxml2
@@ -39,14 +39,13 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     clang
-    hip
+    clr
   ] ++ lib.optionals buildDocs [
     doxygen
     graphviz
   ];
 
   buildInputs = [
-    rocm-device-libs
     rocprofiler
     libxml2
     python3Packages.python
@@ -54,7 +53,7 @@ stdenv.mkDerivation (finalAttrs: {
   ];
 
   cmakeFlags = [
-    "-DCMAKE_MODULE_PATH=${hip}/hip/cmake"
+    "-DCMAKE_MODULE_PATH=${clr}/hip/cmake"
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
@@ -85,7 +84,7 @@ stdenv.mkDerivation (finalAttrs: {
     find $out/test -executable -type f -exec mv {} $test/bin \;
     rm $test/bin/{*.sh,*.py}
     patchelf --set-rpath $out/lib:${lib.makeLibraryPath (
-      finalAttrs.buildInputs ++ [ hip gcc-unwrapped.lib rocm-runtime ])} $test/bin/*
+      finalAttrs.buildInputs ++ [ clr gcc-unwrapped.lib rocm-runtime ])} $test/bin/*
     rm -rf $out/test
   '';
 
@@ -101,6 +100,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ]; # mitx11
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor clr.version;
   };
 })
diff --git a/pkgs/development/libraries/rocwmma/0000-dont-fetch-googletest.patch b/pkgs/development/rocm-modules/5/rocwmma/0000-dont-fetch-googletest.patch
similarity index 100%
rename from pkgs/development/libraries/rocwmma/0000-dont-fetch-googletest.patch
rename to pkgs/development/rocm-modules/5/rocwmma/0000-dont-fetch-googletest.patch
diff --git a/pkgs/development/libraries/rocwmma/default.nix b/pkgs/development/rocm-modules/5/rocwmma/default.nix
similarity index 99%
rename from pkgs/development/libraries/rocwmma/default.nix
rename to pkgs/development/rocm-modules/5/rocwmma/default.nix
index 84db5b4dbebf..ef21ed86248a 100644
--- a/pkgs/development/libraries/rocwmma/default.nix
+++ b/pkgs/development/rocm-modules/5/rocwmma/default.nix
@@ -141,6 +141,6 @@ in stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor hip.version;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/libraries/tensile/default.nix b/pkgs/development/rocm-modules/5/tensile/default.nix
similarity index 100%
rename from pkgs/development/libraries/tensile/default.nix
rename to pkgs/development/rocm-modules/5/tensile/default.nix
diff --git a/pkgs/development/rocm-modules/5/update.nix b/pkgs/development/rocm-modules/5/update.nix
index abd434776ef9..1cc7e354d240 100644
--- a/pkgs/development/rocm-modules/5/update.nix
+++ b/pkgs/development/rocm-modules/5/update.nix
@@ -12,7 +12,7 @@
 let
   pname =
     if lib.hasPrefix "rocm-llvm-" name
-    then "rocmPackages_5.llvm.${lib.removePrefix "rocm-llvm-" name}"
+    then "llvm.${lib.removePrefix "rocm-llvm-" name}"
     else name;
 
   updateScript = writeScript "update.sh" ''
@@ -27,6 +27,6 @@ let
       version="''${version}.0"
     fi
 
-    update-source-version ${pname} "$version" --ignore-same-hash
+    update-source-version rocmPackages_5.${pname} "$version" --ignore-same-hash
   '';
 in [ updateScript ]
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 4e03557c7740..d365df0eb0f0 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -16957,221 +16957,8 @@ with pkgs;
 
   rml = callPackage ../development/compilers/rml { };
 
-  composable_kernel = callPackage ../development/libraries/composable_kernel {
-    inherit (rocmPackages.llvm) openmp clang-tools-extra;
-    stdenv = rocmClangStdenv;
-  };
-
-  rocprofiler = callPackage ../development/libraries/rocprofiler {
-    stdenv = rocmClangStdenv;
-  };
-
-  clang-ocl = callPackage ../development/libraries/clang-ocl {
-    stdenv = rocmClangStdenv;
-  };
-
   rgxg = callPackage ../tools/text/rgxg { };
 
-  rocclr = callPackage ../development/libraries/rocclr {
-    stdenv = rocmClangStdenv;
-  };
-
-  hip-common = callPackage ../development/compilers/hip-common {
-    inherit (rocmPackages.llvm) llvm;
-    stdenv = rocmClangStdenv;
-  };
-
-  hipcc = callPackage ../development/compilers/hipcc {
-    inherit (rocmPackages.llvm) llvm;
-    stdenv = rocmClangStdenv;
-  };
-
-  hip = callPackage ../development/compilers/hip {
-    inherit (rocmPackages.llvm) llvm;
-    inherit (cudaPackages) cudatoolkit;
-    stdenv = rocmClangStdenv;
-  };
-
-  hip-amd = hip.override {
-    useNVIDIA = false;
-  };
-
-  hip-nvidia = hip.override {
-    useNVIDIA = true;
-  };
-
-  hipify = callPackage ../development/compilers/hipify {
-    stdenv = rocmClangStdenv;
-  };
-
-  hipcub = callPackage ../development/libraries/hipcub {
-    stdenv = rocmClangStdenv;
-  };
-
-  hipsparse = callPackage ../development/libraries/hipsparse {
-    inherit (rocmPackages.llvm) openmp;
-    stdenv = rocmClangStdenv;
-  };
-
-  hipfort = callPackage ../development/libraries/hipfort {
-    stdenv = rocmClangStdenv;
-  };
-
-  hipfft = callPackage ../development/libraries/hipfft {
-    inherit (rocmPackages.llvm) openmp;
-    stdenv = rocmClangStdenv;
-  };
-
-  hipsolver = callPackage ../development/libraries/hipsolver {
-    stdenv = rocmClangStdenv;
-  };
-
-  hipblas = callPackage ../development/libraries/hipblas {
-    stdenv = rocmClangStdenv;
-  };
-
-  migraphx = callPackage ../development/libraries/migraphx {
-    inherit (rocmPackages.llvm) clang-tools-extra openmp;
-    stdenv = rocmClangStdenv;
-    rocmlir = rocmlir-rock;
-  };
-
-  rccl = callPackage ../development/libraries/rccl {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocm-cmake = callPackage ../development/tools/build-managers/rocm-cmake {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocm-comgr = callPackage ../development/libraries/rocm-comgr {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocalution = callPackage ../development/libraries/rocalution {
-    inherit (rocmPackages.llvm) openmp;
-    stdenv = rocmClangStdenv;
-  };
-
-  rocm-device-libs = callPackage ../development/libraries/rocm-device-libs {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocm-opencl-icd = callPackage ../development/libraries/rocm-opencl-icd {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocsolver = callPackage ../development/libraries/rocsolver {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocm-opencl-runtime = callPackage ../development/libraries/rocm-opencl-runtime {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocm-runtime = callPackage ../development/libraries/rocm-runtime {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocm-smi = python3Packages.callPackage ../tools/system/rocm-smi {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocm-thunk = callPackage ../development/libraries/rocm-thunk {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocminfo = callPackage ../development/tools/rocminfo {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocmlir = callPackage ../development/libraries/rocmlir {
-    stdenv = rocmClangStdenv;
-  };
-
-  # Best just use GCC here
-  rdc = callPackage ../development/tools/misc/rdc { };
-
-  # Best just use GCC here
-  rocgdb = callPackage ../development/tools/misc/rocgdb {
-    elfutils = elfutils.override { enableDebuginfod = true; };
-  };
-
-  rocdbgapi = callPackage ../development/libraries/rocdbgapi {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocr-debug-agent = callPackage ../development/libraries/rocr-debug-agent {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocmlir-rock = rocmlir.override {
-    buildRockCompiler = true;
-  };
-
-  rocm-core = callPackage ../development/libraries/rocm-core {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocprim = callPackage ../development/libraries/rocprim {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocsparse = callPackage ../development/libraries/rocsparse {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocfft = callPackage ../development/libraries/rocfft {
-    inherit (rocmPackages.llvm) openmp;
-    stdenv = rocmClangStdenv;
-  };
-
-  rocrand = callPackage ../development/libraries/rocrand {
-    stdenv = rocmClangStdenv;
-  };
-
-  tensile = python3Packages.callPackage ../development/libraries/tensile {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocwmma = callPackage ../development/libraries/rocwmma {
-    inherit (rocmPackages.llvm) openmp;
-    stdenv = rocmClangStdenv;
-  };
-
-  rocblas = callPackage ../development/libraries/rocblas {
-    inherit (rocmPackages.llvm) openmp;
-    stdenv = rocmClangStdenv;
-  };
-
-  miopengemm = callPackage ../development/libraries/miopengemm {
-    stdenv = rocmClangStdenv;
-  };
-
-  rocthrust = callPackage ../development/libraries/rocthrust {
-    stdenv = rocmClangStdenv;
-  };
-
-  miopen = callPackage ../development/libraries/miopen {
-    inherit (rocmPackages.llvm) llvm clang-tools-extra;
-    stdenv = rocmClangStdenv;
-    rocmlir = rocmlir-rock;
-    boost = boost179.override { enableStatic = true; };
-  };
-
-  miopen-hip = miopen.override {
-    useOpenCL = false;
-  };
-
-  miopen-opencl = miopen.override {
-    useOpenCL = true;
-  };
-
-  # Requires GCC
-  roctracer = callPackage ../development/libraries/roctracer {
-    inherit (rocmPackages.llvm) clang;
-  };
-
   rtags = callPackage ../development/tools/rtags {
     inherit (darwin) apple_sdk;
   };
@@ -30903,6 +30690,7 @@ with pkgs;
     # LLVM 11 crashes when compiling GHOST_SystemCocoa.mm
     stdenv = if stdenv.isDarwin then llvmPackages_10.stdenv else stdenv;
     inherit (darwin.apple_sdk.frameworks) Cocoa CoreGraphics ForceFeedback OpenAL OpenGL;
+    inherit (rocmPackages) hip;
   };
 
   blender-with-packages = callPackage ../applications/misc/blender/wrapper.nix { };
@@ -39476,6 +39264,7 @@ with pkgs;
 
   inherit (callPackage ../development/libraries/science/math/magma {
     inherit (rocmPackages.llvm) openmp;
+    inherit (rocmPackages) hip hipblas hipsparse;
   }) magma magma_2_7_2 magma_2_6_2;
 
   magma-cuda = magma.override {

From 5021abc8de0deec0eb36c14b193aff7b84bb1adc Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Mon, 2 Oct 2023 00:57:40 -0500
Subject: [PATCH 13/30] pythonPackages: barectf: init at 3.1.2

---
 .../python-modules/barectf/default.nix        | 44 +++++++++++++++++++
 pkgs/top-level/python-packages.nix            |  2 +
 2 files changed, 46 insertions(+)
 create mode 100644 pkgs/development/python-modules/barectf/default.nix

diff --git a/pkgs/development/python-modules/barectf/default.nix b/pkgs/development/python-modules/barectf/default.nix
new file mode 100644
index 000000000000..429e03fbe940
--- /dev/null
+++ b/pkgs/development/python-modules/barectf/default.nix
@@ -0,0 +1,44 @@
+{ lib
+, buildPythonPackage
+, fetchFromGitHub
+, poetry-core
+, pytestCheckHook
+, setuptools
+, jsonschema
+, pyyaml
+, jinja2
+, termcolor
+}:
+
+buildPythonPackage rec {
+  pname = "barectf";
+  version = "3.1.2";
+  format = "pyproject";
+
+  src = fetchFromGitHub {
+    owner = "efficios";
+    repo = "barectf";
+    rev = "v${version}";
+    hash = "sha256-JelFfd3WS012dveNlIljhLdyPmgE9VEOXoZE3MBA/Gw=";
+  };
+
+  nativeBuildInputs = [ poetry-core ];
+  nativeCheckInputs = [ pytestCheckHook ];
+
+  propagatedBuildInputs = [
+    setuptools # needs pkg_resources at runtime
+    jsonschema
+    pyyaml
+    jinja2
+    termcolor
+  ];
+
+  pythonImportsCheck = [ "barectf" ];
+
+  meta = with lib; {
+    description = "Generator of ANSI C tracers which output CTF data streams ";
+    homepage = "https://github.com/efficios/barectf";
+    license = licenses.mit;
+    maintainers = with maintainers; [ Madouura ];
+  };
+}
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index 4521548cb3d8..c0857bf850f8 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -1297,6 +1297,8 @@ self: super: with self; {
     inherit (pkgs.ocaml-ng.ocamlPackages) bap;
   };
 
+  barectf = callPackage ../development/python-modules/barectf { };
+
   baron = callPackage ../development/python-modules/baron { };
 
   base36 = callPackage ../development/python-modules/base36 { };

From 91fc34e29f430238c20010855c001619843bd150 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Mon, 2 Oct 2023 02:00:19 -0500
Subject: [PATCH 14/30] rocmPackages.hsa-amd-aqlprofile-bin: init at 5.7.0

---
 pkgs/development/rocm-modules/5/default.nix   |  5 +++
 .../5/hsa-amd-aqlprofile-bin/default.nix      | 45 +++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 pkgs/development/rocm-modules/5/hsa-amd-aqlprofile-bin/default.nix

diff --git a/pkgs/development/rocm-modules/5/default.nix b/pkgs/development/rocm-modules/5/default.nix
index 8bc496f452f1..c4567410c385 100644
--- a/pkgs/development/rocm-modules/5/default.nix
+++ b/pkgs/development/rocm-modules/5/default.nix
@@ -59,6 +59,11 @@ in rec {
     stdenv = llvm.rocmClangStdenv;
   };
 
+  # Unfree
+  hsa-amd-aqlprofile-bin = callPackage ./hsa-amd-aqlprofile-bin {
+    stdenv = llvm.rocmClangStdenv;
+  };
+
   # Broken, too many errors
   rdc = callPackage ./rdc {
     inherit rocmUpdateScript rocm-smi rocm-runtime;
diff --git a/pkgs/development/rocm-modules/5/hsa-amd-aqlprofile-bin/default.nix b/pkgs/development/rocm-modules/5/hsa-amd-aqlprofile-bin/default.nix
new file mode 100644
index 000000000000..d13092fd3eef
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/hsa-amd-aqlprofile-bin/default.nix
@@ -0,0 +1,45 @@
+{ lib
+, stdenv
+, fetchurl
+, dpkg
+}:
+
+let
+  prefix = "hsa-amd-aqlprofile";
+  version = "5.7.0";
+  major = lib.versions.major version;
+  minor = lib.versions.minor version;
+  patch = lib.versions.patch version;
+  magic = lib.strings.concatStrings (lib.strings.intersperse "0" (lib.versions.splitVersion version));
+in stdenv.mkDerivation (finalAttrs: {
+  inherit version;
+  pname = "${prefix}-bin";
+
+  src = fetchurl {
+    url = "https://repo.radeon.com/rocm/apt/${major}.${minor}/pool/main/h/${prefix}/${prefix}_1.0.0.${magic}.${magic}-63~22.04_amd64.deb";
+    hash = "sha256-FQ25eXkhnvOmcf0sGW3GYu9kZj69bVvZrh0jVx/G/kI=";
+  };
+
+  nativeBuildInputs = [ dpkg ];
+  dontPatch = true;
+  dontConfigure = true;
+  dontBuild = true;
+
+  installPhase = ''
+    runHook preInstall
+
+    mkdir -p $out
+    cp -a opt/rocm-${version}/* $out
+
+    runHook postInstall
+  '';
+
+  meta = with lib; {
+    description = "AQLPROFILE library for AMD HSA runtime API extension support";
+    homepage = "https://rocm.docs.amd.com/en/latest/";
+    license = with licenses; [ unfree ];
+    maintainers = teams.rocm.members;
+    platforms = platforms.linux;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
+  };
+})

From ab01e7cd07ee0698400943ba7f15c3d7eb30d3d7 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Mon, 2 Oct 2023 06:12:00 -0500
Subject: [PATCH 15/30] rocmPackages.llvm: fixup for 5.7.0

rocmPackages.llvm.openmp: fixup for 5.7.0

rocmPackages.llvm.mlir: fixup for 5.7.0

rocmPackages.llvm.flang: fixup for 5.7.0
---
 pkgs/development/rocm-modules/5/default.nix   |   2 +-
 .../rocm-modules/5/llvm/default.nix           |   8 +-
 .../stage-3/1000-openmp-failing-tests.list    | 122 ++++++++++++++++++
 .../rocm-modules/5/llvm/stage-3/flang.nix     |  23 ++--
 .../rocm-modules/5/llvm/stage-3/mlir.nix      |  12 +-
 .../rocm-modules/5/llvm/stage-3/openmp.nix    |  21 ++-
 6 files changed, 160 insertions(+), 28 deletions(-)
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/1000-openmp-failing-tests.list

diff --git a/pkgs/development/rocm-modules/5/default.nix b/pkgs/development/rocm-modules/5/default.nix
index c4567410c385..be597edfe150 100644
--- a/pkgs/development/rocm-modules/5/default.nix
+++ b/pkgs/development/rocm-modules/5/default.nix
@@ -10,7 +10,7 @@ let
   rocmUpdateScript = callPackage ./update.nix { };
 in rec {
   ## RadeonOpenCompute ##
-  llvm = recurseIntoAttrs (callPackage ./llvm/default.nix { inherit rocmUpdateScript; });
+  llvm = recurseIntoAttrs (callPackage ./llvm/default.nix { inherit rocmUpdateScript rocm-device-libs rocm-runtime rocm-thunk clr; });
 
   rocm-core = callPackage ./rocm-core {
     inherit rocmUpdateScript;
diff --git a/pkgs/development/rocm-modules/5/llvm/default.nix b/pkgs/development/rocm-modules/5/llvm/default.nix
index 11f8241251e6..9226fb87802c 100644
--- a/pkgs/development/rocm-modules/5/llvm/default.nix
+++ b/pkgs/development/rocm-modules/5/llvm/default.nix
@@ -3,6 +3,10 @@
 , rocmUpdateScript
 , wrapBintoolsWith
 , overrideCC
+, rocm-device-libs
+, rocm-runtime
+, rocm-thunk
+, clr
 }:
 
 let
@@ -43,10 +47,10 @@ in rec {
   clang-tools-extra = callPackage ./stage-3/clang-tools-extra.nix { inherit rocmUpdateScript llvm clang-unwrapped; stdenv = rocmClangStdenv; };
   libclc = callPackage ./stage-3/libclc.nix { inherit rocmUpdateScript llvm clang; stdenv = rocmClangStdenv; };
   lldb = callPackage ./stage-3/lldb.nix { inherit rocmUpdateScript clang; stdenv = rocmClangStdenv; };
-  mlir = callPackage ./stage-3/mlir.nix { inherit rocmUpdateScript; stdenv = rocmClangStdenv; };
+  mlir = callPackage ./stage-3/mlir.nix { inherit rocmUpdateScript clr; stdenv = rocmClangStdenv; };
   polly = callPackage ./stage-3/polly.nix { inherit rocmUpdateScript; stdenv = rocmClangStdenv; };
   flang = callPackage ./stage-3/flang.nix { inherit rocmUpdateScript clang-unwrapped mlir; stdenv = rocmClangStdenv; };
-  openmp = callPackage ./stage-3/openmp.nix { inherit rocmUpdateScript llvm clang-unwrapped clang; stdenv = rocmClangStdenv; };
+  openmp = callPackage ./stage-3/openmp.nix { inherit rocmUpdateScript llvm clang-unwrapped clang rocm-device-libs rocm-runtime rocm-thunk; stdenv = rocmClangStdenv; };
 
   # Runtimes
   pstl = callPackage ./stage-3/pstl.nix { inherit rocmUpdateScript; stdenv = rocmClangStdenv; };
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/1000-openmp-failing-tests.list b/pkgs/development/rocm-modules/5/llvm/stage-3/1000-openmp-failing-tests.list
new file mode 100644
index 000000000000..e53b21b3c535
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/1000-openmp-failing-tests.list
@@ -0,0 +1,122 @@
+runtime/test/tasking/hidden_helper_task/gtid.cpp
+runtime/test/ompt/parallel/parallel_if0.c
+runtime/test/ompt/parallel/serialized.c
+runtime/test/ompt/teams/parallel_team.c
+runtime/test/ompt/teams/serial_teams.c
+runtime/test/ompt/teams/serialized.c
+runtime/test/ompt/teams/team.c
+libomptarget/test/api/assert.c
+libomptarget/test/api/omp_device_managed_memory.c
+libomptarget/test/api/omp_device_memory.c
+libomptarget/test/api/omp_get_device_num.c
+libomptarget/test/api/omp_host_pinned_memory.c
+libomptarget/test/api/omp_host_pinned_memory_alloc.c
+libomptarget/test/api/omp_target_memcpy_async1.c
+libomptarget/test/api/omp_target_memcpy_async2.c
+libomptarget/test/api/omp_target_memcpy_rect_async1.c
+libomptarget/test/api/omp_target_memcpy_rect_async2.c
+libomptarget/test/mapping/array_section_implicit_capture.c
+libomptarget/test/mapping/data_absent_at_exit.c
+libomptarget/test/mapping/data_member_ref.cpp
+libomptarget/test/mapping/declare_mapper_api.cpp
+libomptarget/test/mapping/declare_mapper_target.cpp
+libomptarget/test/mapping/declare_mapper_target_data.cpp
+libomptarget/test/mapping/declare_mapper_target_data_enter_exit.cpp
+libomptarget/test/mapping/firstprivate_aligned.cpp
+libomptarget/test/mapping/has_device_addr.cpp
+libomptarget/test/mapping/implicit_device_ptr.c
+libomptarget/test/mapping/is_device_ptr.cpp
+libomptarget/test/mapping/lambda_mapping.cpp
+libomptarget/test/mapping/low_alignment.c
+libomptarget/test/mapping/map_back_race.cpp
+libomptarget/test/mapping/power_of_two_alignment.c
+libomptarget/test/mapping/pr38704.c
+libomptarget/test/mapping/prelock.cpp
+libomptarget/test/mapping/present/target_data_at_exit.c
+libomptarget/test/mapping/private_mapping.c
+libomptarget/test/mapping/ptr_and_obj_motion.c
+libomptarget/test/mapping/reduction_implicit_map.cpp
+libomptarget/test/mapping/target_derefence_array_pointrs.cpp
+libomptarget/test/mapping/target_map_for_member_data.cpp
+libomptarget/test/mapping/target_update_array_extension.c
+libomptarget/test/mapping/target_use_device_addr.c
+libomptarget/test/offloading/atomic-compare-signedness.c
+libomptarget/test/offloading/bug47654.cpp
+libomptarget/test/offloading/bug49021.cpp
+libomptarget/test/offloading/bug49779.cpp
+libomptarget/test/offloading/bug50022.cpp
+libomptarget/test/offloading/bug51781.c
+libomptarget/test/offloading/bug51982.c
+libomptarget/test/offloading/bug53727.cpp
+libomptarget/test/offloading/complex_reduction.cpp
+libomptarget/test/offloading/cuda_no_devices.c
+libomptarget/test/offloading/d2d_memcpy.c
+libomptarget/test/offloading/dynamic_module.c
+libomptarget/test/offloading/dynamic_module_load.c
+libomptarget/test/offloading/global_constructor.cpp
+libomptarget/test/offloading/lone_target_exit_data.c
+libomptarget/test/offloading/memory_manager.cpp
+libomptarget/test/offloading/parallel_offloading_map.cpp
+libomptarget/test/offloading/static_linking.c
+libomptarget/test/offloading/std_complex_arithmetic.cpp
+libomptarget/test/offloading/target-teams-atomic.c
+libomptarget/test/offloading/target_constexpr_mapping.cpp
+libomptarget/test/offloading/target_critical_region.cpp
+libomptarget/test/offloading/target_depend_nowait.cpp
+libomptarget/test/offloading/target_nowait_target.cpp
+libomptarget/test/offloading/taskloop_offload_nowait.cpp
+libomptarget/test/offloading/test_libc.cpp
+libomptarget/test/ompt/veccopy.c
+libomptarget/test/ompt/veccopy_disallow_both.c
+libomptarget/test/ompt/veccopy_emi.c
+libomptarget/test/ompt/veccopy_emi_map.c
+libomptarget/test/ompt/veccopy_map.c
+libomptarget/test/ompt/veccopy_no_device_init.c
+libomptarget/test/ompt/veccopy_wrong_return.c
+libomptarget/test/api/is_initial_device.c
+libomptarget/test/mapping/declare_mapper_nested_default_mappers_array_subscript.cpp
+libomptarget/test/mapping/declare_mapper_nested_default_mappers_ptr_subscript.cpp
+libomptarget/test/mapping/declare_mapper_nested_default_mappers_var.cpp
+libomptarget/test/mapping/target_pointers_members_map.cpp
+libomptarget/test/api/omp_dynamic_shared_memory_mixed.c
+libomptarget/test/api/omp_env_vars.c
+libomptarget/test/api/omp_get_mapped_ptr.c
+libomptarget/test/api/omp_get_num_devices.c
+libomptarget/test/api/omp_get_num_devices_with_empty_target.c
+libomptarget/test/mapping/alloc_fail.c
+libomptarget/test/mapping/array_section_use_device_ptr.c
+libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
+libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
+libomptarget/test/mapping/declare_mapper_target_update.cpp
+libomptarget/test/mapping/delete_inf_refcount.c
+libomptarget/test/mapping/lambda_by_value.cpp
+libomptarget/test/mapping/ompx_hold/omp_target_disassociate_ptr.c
+libomptarget/test/mapping/ompx_hold/struct.c
+libomptarget/test/mapping/ompx_hold/target-data.c
+libomptarget/test/mapping/ompx_hold/target.c
+libomptarget/test/mapping/present/target.c
+libomptarget/test/mapping/present/target_array_extension.c
+libomptarget/test/mapping/present/target_data.c
+libomptarget/test/mapping/present/target_data_array_extension.c
+libomptarget/test/mapping/present/target_enter_data.c
+libomptarget/test/mapping/present/target_exit_data_delete.c
+libomptarget/test/mapping/present/target_exit_data_release.c
+libomptarget/test/mapping/present/target_update.c
+libomptarget/test/mapping/present/target_update_array_extension.c
+libomptarget/test/mapping/present/zero_length_array_section.c
+libomptarget/test/mapping/present/zero_length_array_section_exit.c
+libomptarget/test/mapping/target_data_array_extension_at_exit.c
+libomptarget/test/mapping/target_has_device_addr.c
+libomptarget/test/mapping/target_implicit_partial_map.c
+libomptarget/test/mapping/target_wrong_use_device_addr.c
+libomptarget/test/offloading/host_as_target.c
+libomptarget/test/offloading/info.c
+libomptarget/test/offloading/offloading_success.c
+libomptarget/test/offloading/offloading_success.cpp
+libomptarget/test/offloading/wtime.c
+libomptarget/test/unified_shared_memory/api.c
+libomptarget/test/unified_shared_memory/associate_ptr.c
+libomptarget/test/unified_shared_memory/close_enter_exit.c
+libomptarget/test/unified_shared_memory/close_manual.c
+libomptarget/test/unified_shared_memory/close_member.c
+libomptarget/test/unified_shared_memory/close_modifier.c
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix
index 7289602451db..f5b4649e3f4b 100644
--- a/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix
@@ -3,31 +3,26 @@
 , rocmUpdateScript
 , clang-unwrapped
 , mlir
+, graphviz
 , python3Packages
 }:
 
 callPackage ../base.nix rec {
   inherit stdenv rocmUpdateScript;
-  buildTests = false; # `Executable "flang1" doesn't exist!`
   targetName = "flang";
   targetDir = targetName;
-  extraNativeBuildInputs = [ python3Packages.sphinx-markdown-tables ];
+
+  extraNativeBuildInputs = [
+    graphviz
+    python3Packages.sphinx-markdown-tables
+  ];
+
   extraBuildInputs = [ mlir ];
 
   extraCMakeFlags = [
-    "-DCMAKE_POLICY_DEFAULT_CMP0116=NEW"
     "-DCLANG_DIR=${clang-unwrapped}/lib/cmake/clang"
-    "-DFLANG_INCLUDE_TESTS=OFF"
     "-DMLIR_TABLEGEN_EXE=${mlir}/bin/mlir-tblgen"
+    "-DCLANG_TABLEGEN_EXE=${clang-unwrapped}/bin/clang-tblgen"
+    "-DFLANG_INCLUDE_TESTS=OFF" # `The dependency target "Bye" of target ...`
   ];
-
-  extraPostPatch = ''
-    substituteInPlace test/CMakeLists.txt \
-      --replace "FileCheck" "" \
-      --replace "count" "" \
-      --replace "not" ""
-
-    substituteInPlace docs/CMakeLists.txt \
-      --replace "CLANG_TABLEGEN_EXE clang-tblgen" "CLANG_TABLEGEN_EXE ${clang-unwrapped}/bin/clang-tblgen"
-  '';
 }
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix
index 099622ca7cb8..0e0dcabec60c 100644
--- a/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix
@@ -1,8 +1,7 @@
 { stdenv
 , callPackage
 , rocmUpdateScript
-# , hip
-# , rocm-comgr
+, clr
 , vulkan-headers
 , vulkan-loader
 , glslang
@@ -16,10 +15,9 @@ callPackage ../base.nix rec {
   buildMan = false; # No man pages to build
   targetName = "mlir";
   targetDir = targetName;
-  # extraNativeBuildInputs = [ hip ];
+  extraNativeBuildInputs = [ clr ];
 
   extraBuildInputs = [
-    # rocm-comgr
     vulkan-headers
     vulkan-loader
     glslang
@@ -27,7 +25,6 @@ callPackage ../base.nix rec {
   ];
 
   extraCMakeFlags = [
-    "-DCMAKE_POLICY_DEFAULT_CMP0116=NEW"
     "-DMLIR_INCLUDE_DOCS=ON"
     "-DMLIR_INCLUDE_TESTS=ON"
     "-DMLIR_ENABLE_ROCM_RUNNER=ON"
@@ -41,6 +38,10 @@ callPackage ../base.nix rec {
     mkdir -p ../llvm/build/bin
     ln -s ${lit}/bin/lit ../llvm/build/bin/llvm-lit
 
+    # `add_library cannot create target "llvm_gtest" because an imported target with the same name already exists`
+    substituteInPlace CMakeLists.txt \
+      --replace "EXISTS \''${UNITTEST_DIR}/googletest/include/gtest/gtest.h" "FALSE"
+
     substituteInPlace test/CMakeLists.txt \
       --replace "FileCheck count not" "" \
       --replace "list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)" ""
@@ -64,4 +65,5 @@ callPackage ../base.nix rec {
 
   checkTargets = [ "check-${targetName}" ];
   requiredSystemFeatures = [ "big-parallel" ];
+  isBroken = true; # `DebugTranslation.cpp:139:10: error: no matching function for call to 'get'`
 }
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/openmp.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/openmp.nix
index faab6388835e..5fd7b6fd9aa3 100644
--- a/pkgs/development/rocm-modules/5/llvm/stage-3/openmp.nix
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/openmp.nix
@@ -5,39 +5,48 @@
 , llvm
 , clang
 , clang-unwrapped
-# , rocm-device-libs
-# , rocm-runtime
+, rocm-device-libs
+, rocm-runtime
+, rocm-thunk
 , perl
 , elfutils
+, libdrm
+, numactl
 , lit
 }:
 
 callPackage ../base.nix rec {
   inherit stdenv rocmUpdateScript;
-  buildTests = false; # Too many failures, most pass
   targetName = "openmp";
   targetDir = targetName;
   extraNativeBuildInputs = [ perl ];
 
   extraBuildInputs = [
-    # rocm-device-libs
-    # rocm-runtime
+    rocm-device-libs
+    rocm-runtime
+    rocm-thunk
     elfutils
+    libdrm
+    numactl
   ];
 
   extraCMakeFlags = [
     "-DCMAKE_MODULE_PATH=/build/source/llvm/cmake/modules" # For docs
     "-DCLANG_TOOL=${clang}/bin/clang"
     "-DCLANG_OFFLOAD_BUNDLER_TOOL=${clang-unwrapped}/bin/clang-offload-bundler"
+    "-DPACKAGER_TOOL=${clang-unwrapped}/bin/clang-offload-packager"
     "-DOPENMP_LLVM_TOOLS_DIR=${llvm}/bin"
     "-DOPENMP_LLVM_LIT_EXECUTABLE=${lit}/bin/.lit-wrapped"
-    # "-DDEVICELIBS_ROOT=${rocm-device-libs.src}"
+    "-DDEVICELIBS_ROOT=${rocm-device-libs.src}"
   ];
 
   extraPostPatch = ''
     # We can't build this target at the moment
     substituteInPlace libomptarget/DeviceRTL/CMakeLists.txt \
       --replace "gfx1010" ""
+
+    # No idea what's going on here...
+    cat ${./1000-openmp-failing-tests.list} | xargs -d \\n rm
   '';
 
   checkTargets = [ "check-${targetName}" ];

From 56f1d971abb970ff5f59be78e124ae8b2c35601b Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Wed, 4 Oct 2023 19:45:17 -0500
Subject: [PATCH 16/30] rocmPackages.llvm.mlir: fix upstream bug

---
 .../0000-mlir-fix-debugtranslation.patch      | 36 +++++++++++++++++++
 .../llvm/stage-3/1001-mlir-failing-tests.list | 11 ++++++
 .../rocm-modules/5/llvm/stage-3/mlir.nix      | 26 ++++----------
 3 files changed, 54 insertions(+), 19 deletions(-)
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/0000-mlir-fix-debugtranslation.patch
 create mode 100644 pkgs/development/rocm-modules/5/llvm/stage-3/1001-mlir-failing-tests.list

diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/0000-mlir-fix-debugtranslation.patch b/pkgs/development/rocm-modules/5/llvm/stage-3/0000-mlir-fix-debugtranslation.patch
new file mode 100644
index 000000000000..f4221a088136
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/0000-mlir-fix-debugtranslation.patch
@@ -0,0 +1,36 @@
+From f1d1e10ec7e1061bf0b90abbc1e298d9438a5e74 Mon Sep 17 00:00:00 2001
+From: Scott Linder <Scott.Linder@amd.com>
+Date: Mon, 11 Sep 2023 18:37:37 +0000
+Subject: [PATCH] [HeterogeneousDWARF] Update MLIR DI Metadata handling
+
+Pass a default DW_MSPACE_LLVM_none to satisfy new API
+
+Change-Id: I50df461f00b5510a715f55f61107122318102d22
+---
+ lib/Target/LLVMIR/DebugTranslation.cpp | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/lib/Target/LLVMIR/DebugTranslation.cpp b/lib/Target/LLVMIR/DebugTranslation.cpp
+index 2053f5bcef06aa6..635ee5d7e5fefdc 100644
+--- a/lib/Target/LLVMIR/DebugTranslation.cpp
++++ b/lib/Target/LLVMIR/DebugTranslation.cpp
+@@ -148,7 +148,8 @@ llvm::DIDerivedType *DebugTranslation::translateImpl(DIDerivedTypeAttr attr) {
+       /*File=*/nullptr, /*Line=*/0,
+       /*Scope=*/nullptr, translate(attr.getBaseType()), attr.getSizeInBits(),
+       attr.getAlignInBits(), attr.getOffsetInBits(),
+-      /*DWARFAddressSpace=*/std::nullopt, /*Flags=*/llvm::DINode::FlagZero);
++      /*DWARFAddressSpace=*/std::nullopt, llvm::dwarf::DW_MSPACE_LLVM_none,
++      /*Flags=*/llvm::DINode::FlagZero);
+ }
+ 
+ llvm::DIFile *DebugTranslation::translateImpl(DIFileAttr attr) {
+@@ -185,7 +186,8 @@ DebugTranslation::translateImpl(DILocalVariableAttr attr) {
+       llvmCtx, translate(attr.getScope()), getMDStringOrNull(attr.getName()),
+       translate(attr.getFile()), attr.getLine(), translate(attr.getType()),
+       attr.getArg(),
+-      /*Flags=*/llvm::DINode::FlagZero, attr.getAlignInBits(),
++      /*Flags=*/llvm::DINode::FlagZero, llvm::dwarf::DW_MSPACE_LLVM_none,
++      attr.getAlignInBits(),
+       /*Annotations=*/nullptr);
+ }
+ 
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/1001-mlir-failing-tests.list b/pkgs/development/rocm-modules/5/llvm/stage-3/1001-mlir-failing-tests.list
new file mode 100644
index 000000000000..0b3d2d22592d
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/1001-mlir-failing-tests.list
@@ -0,0 +1,11 @@
+./test/Target/LLVMIR/openmp-llvm.mlir
+./test/mlir-spirv-cpu-runner/double.mlir
+./test/mlir-spirv-cpu-runner/simple_add.mlir
+./test/mlir-vulkan-runner/addf.mlir
+./test/mlir-vulkan-runner/addi.mlir
+./test/mlir-vulkan-runner/addi8.mlir
+./test/mlir-vulkan-runner/mulf.mlir
+./test/mlir-vulkan-runner/smul_extended.mlir
+./test/mlir-vulkan-runner/subf.mlir
+./test/mlir-vulkan-runner/time.mlir
+./test/mlir-vulkan-runner/umul_extended.mlir
diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix
index 0e0dcabec60c..1b0bc29ea62b 100644
--- a/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/mlir.nix
@@ -15,6 +15,11 @@ callPackage ../base.nix rec {
   buildMan = false; # No man pages to build
   targetName = "mlir";
   targetDir = targetName;
+
+  # Fix `DebugTranslation.cpp:139:10: error: no matching function for call to 'get'`
+  # We patch at a different source root, so we modify the patch and include it locally
+  # https://github.com/RadeonOpenCompute/llvm-project/commit/f1d1e10ec7e1061bf0b90abbc1e298d9438a5e74.patch
+  extraPatches = [ ./0000-mlir-fix-debugtranslation.patch ];
   extraNativeBuildInputs = [ clr ];
 
   extraBuildInputs = [
@@ -34,28 +39,12 @@ callPackage ../base.nix rec {
   ];
 
   extraPostPatch = ''
-    chmod +w ../llvm
-    mkdir -p ../llvm/build/bin
-    ln -s ${lit}/bin/lit ../llvm/build/bin/llvm-lit
-
     # `add_library cannot create target "llvm_gtest" because an imported target with the same name already exists`
     substituteInPlace CMakeLists.txt \
       --replace "EXISTS \''${UNITTEST_DIR}/googletest/include/gtest/gtest.h" "FALSE"
 
-    substituteInPlace test/CMakeLists.txt \
-      --replace "FileCheck count not" "" \
-      --replace "list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)" ""
-
-    substituteInPlace lib/ExecutionEngine/CMakeLists.txt \
-      --replace "return()" ""
-
-    # Remove problematic tests
-    rm test/CAPI/execution_engine.c
-    rm test/Target/LLVMIR/llvmir-intrinsics.mlir
-    rm test/Target/LLVMIR/llvmir.mlir
-    rm test/Target/LLVMIR/openmp-llvm.mlir
-    rm test/mlir-cpu-runner/*.mlir
-    rm test/mlir-vulkan-runner/*.mlir
+    # Mainly `No such file or directory`
+    cat ${./1001-mlir-failing-tests.list} | xargs -d \\n rm
   '';
 
   extraPostInstall = ''
@@ -65,5 +54,4 @@ callPackage ../base.nix rec {
 
   checkTargets = [ "check-${targetName}" ];
   requiredSystemFeatures = [ "big-parallel" ];
-  isBroken = true; # `DebugTranslation.cpp:139:10: error: no matching function for call to 'get'`
 }

From 6da31e558ad4d02fa6412f3312e67d13a4003594 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Wed, 4 Oct 2023 20:53:51 -0500
Subject: [PATCH 17/30] rocmPackages.llvm.flang: mark broken due to error

---
 pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix b/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix
index f5b4649e3f4b..421663dcb1b7 100644
--- a/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix
+++ b/pkgs/development/rocm-modules/5/llvm/stage-3/flang.nix
@@ -25,4 +25,7 @@ callPackage ../base.nix rec {
     "-DCLANG_TABLEGEN_EXE=${clang-unwrapped}/bin/clang-tblgen"
     "-DFLANG_INCLUDE_TESTS=OFF" # `The dependency target "Bye" of target ...`
   ];
+
+  # `flang/lib/Semantics/check-omp-structure.cpp:1905:1: error: no member named 'v' in 'Fortran::parser::OmpClause::OmpxDynCgroupMem'`
+  isBroken = true;
 }

From 0a89aedcea092534a4e0e2e57672dd35d5c8f518 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Tue, 3 Oct 2023 20:42:42 -0500
Subject: [PATCH 18/30] rocmPackages: fixup for 5.7.0

rocmPackages.rocprim: fixup for 5.7.0

rocmPackages.rocsparse: fixup for 5.7.0

rocmPackages.rocthrust: fixup for 5.7.0

rocmPackages.rocrand: fixup for 5.7.0

rocmPackages.rocfft: fixup for 5.7.0

rocmPackages.rccl: fixup for 5.7.0

rocmPackages.hipcub: fixup for 5.7.0

rocmPackages.hipsparse: fixup for 5.7.0

rocmPackages.hipfort: fixup for 5.7.0

rocmPackages.hipfft: fixup for 5.7.0

rocmPackages.tensile: fixup for 5.7.0

rocmPackages.rocblas: fixup for 5.7.0

rocmPackages.rocsolver: fixup for 5.7.0

rocmPackages.rocwmma: fixup for 5.7.0

rocmPackages.rocalution: fixup for 5.7.0

rocmPackages.rocmlir: fixup for 5.7.0

rocmPackages.hipsolver: fixup for 5.7.0

rocmPackages.hipblas: fixup for 5.7.0

rocmPackages.miopengemm: fixup for 5.7.0

rocmPackages.miopen: fixup for 5.7.0

rocmPackages.migraphx: fixup for 5.7.0
---
 pkgs/development/rocm-modules/5/default.nix   | 117 ++++++++------
 .../rocm-modules/5/hipblas/default.nix        |   7 +-
 .../rocm-modules/5/hipcub/default.nix         |   9 +-
 .../rocm-modules/5/hipfft/default.nix         |  13 +-
 .../rocm-modules/5/hipsolver/default.nix      |   4 +-
 .../rocm-modules/5/hipsparse/default.nix      |  14 +-
 .../rocm-modules/5/migraphx/default.nix       |  28 +++-
 .../rocm-modules/5/miopen/default.nix         | 146 ++++++++++++------
 .../rocm-modules/5/miopen/deps.nix            |  45 ------
 .../rocm-modules/5/miopengemm/default.nix     |  21 +--
 .../rocm-modules/5/rccl/default.nix           |  20 ++-
 .../rocm-modules/5/rocalution/default.nix     |  10 +-
 .../rocm-modules/5/rocblas/default.nix        |   8 +-
 .../rocm-modules/5/rocfft/default.nix         |  94 ++---------
 .../5/rocfft/device-install.patch             |  15 --
 .../5/rocfft/split-kernel-compilation.patch   | 124 ---------------
 .../rocm-modules/5/rocmlir/default.nix        |  44 ++++--
 .../rocm-modules/5/rocprim/default.nix        |   8 +-
 .../rocm-modules/5/rocrand/default.nix        |   9 +-
 .../rocm-modules/5/rocsolver/default.nix      |   7 +-
 .../rocm-modules/5/rocsparse/default.nix      |   7 +-
 .../rocm-modules/5/rocthrust/default.nix      |   9 +-
 .../rocwmma/0000-dont-fetch-googletest.patch  |  12 +-
 .../rocm-modules/5/rocwmma/default.nix        |  55 +------
 .../rocm-modules/5/tensile/default.nix        |  25 ++-
 25 files changed, 357 insertions(+), 494 deletions(-)
 delete mode 100644 pkgs/development/rocm-modules/5/miopen/deps.nix
 delete mode 100644 pkgs/development/rocm-modules/5/rocfft/device-install.patch
 delete mode 100644 pkgs/development/rocm-modules/5/rocfft/split-kernel-compilation.patch

diff --git a/pkgs/development/rocm-modules/5/default.nix b/pkgs/development/rocm-modules/5/default.nix
index be597edfe150..d3effc777613 100644
--- a/pkgs/development/rocm-modules/5/default.nix
+++ b/pkgs/development/rocm-modules/5/default.nix
@@ -82,6 +82,7 @@ in rec {
     stdenv = llvm.rocmClangStdenv;
   };
 
+  # Replaces hip, opencl-runtime, and rocclr
   clr = callPackage ./clr {
     inherit rocmUpdateScript hip-common hipcc rocm-device-libs rocm-comgr rocm-runtime roctracer rocminfo;
     inherit (llvm) clang;
@@ -122,70 +123,92 @@ in rec {
     stdenv = llvm.rocmClangStdenv;
   };
 
+  ## ROCmSoftwarePlatform ##
+  rocprim = callPackage ./rocprim {
+    inherit rocmUpdateScript rocm-cmake clr;
+    stdenv = llvm.rocmClangStdenv;
+  };
 
+  rocsparse = callPackage ./rocsparse {
+    inherit rocmUpdateScript rocm-cmake rocprim clr;
+    stdenv = llvm.rocmClangStdenv;
+  };
 
+  rocthrust = callPackage ./rocthrust {
+    inherit rocmUpdateScript rocm-cmake rocprim clr;
+    stdenv = llvm.rocmClangStdenv;
+  };
 
+  rocrand = callPackage ./rocrand {
+    inherit rocmUpdateScript rocm-cmake clr;
+    stdenv = llvm.rocmClangStdenv;
+  };
 
+  hiprand = rocrand; # rocrand includes hiprand
 
+  rocfft = callPackage ./rocfft {
+    inherit rocmUpdateScript rocm-cmake rocrand rocfft clr;
+    inherit (llvm) openmp;
+    stdenv = llvm.rocmClangStdenv;
+  };
 
-
-
-
-
-
-
-
-  composable_kernel = callPackage ./composable_kernel {
-    inherit (llvm) openmp clang-tools-extra;
+  rccl = callPackage ./rccl {
+    inherit rocmUpdateScript rocm-cmake rocm-smi clr hipify;
     stdenv = llvm.rocmClangStdenv;
   };
 
   hipcub = callPackage ./hipcub {
+    inherit rocmUpdateScript rocm-cmake rocprim clr;
     stdenv = llvm.rocmClangStdenv;
   };
 
   hipsparse = callPackage ./hipsparse {
+    inherit rocmUpdateScript rocm-cmake rocsparse clr;
     inherit (llvm) openmp;
     stdenv = llvm.rocmClangStdenv;
   };
 
   hipfort = callPackage ./hipfort {
+    inherit rocmUpdateScript rocm-cmake;
     stdenv = llvm.rocmClangStdenv;
   };
 
   hipfft = callPackage ./hipfft {
+    inherit rocmUpdateScript rocm-cmake rocfft clr;
     inherit (llvm) openmp;
     stdenv = llvm.rocmClangStdenv;
   };
 
-  hipsolver = callPackage ./hipsolver {
+  tensile = python3Packages.callPackage ./tensile {
+    inherit rocmUpdateScript rocminfo;
     stdenv = llvm.rocmClangStdenv;
   };
 
-  hipblas = callPackage ./hipblas {
-    stdenv = llvm.rocmClangStdenv;
-  };
-
-  migraphx = callPackage ./migraphx {
-    inherit (llvm) clang-tools-extra openmp;
-    stdenv = llvm.rocmClangStdenv;
-    rocmlir = rocmlir-rock;
-  };
-
-  rccl = callPackage ./rccl {
-    stdenv = llvm.rocmClangStdenv;
-  };
-
-  rocalution = callPackage ./rocalution {
+  rocblas = callPackage ./rocblas {
+    inherit rocmUpdateScript rocm-cmake clr tensile;
     inherit (llvm) openmp;
     stdenv = llvm.rocmClangStdenv;
   };
 
   rocsolver = callPackage ./rocsolver {
+    inherit rocmUpdateScript rocm-cmake rocblas rocsparse clr;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocwmma = callPackage ./rocwmma {
+    inherit rocmUpdateScript rocm-cmake rocm-smi rocblas clr;
+    inherit (llvm) openmp;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
+  rocalution = callPackage ./rocalution {
+    inherit rocmUpdateScript rocm-cmake rocprim rocsparse rocrand rocblas clr;
+    inherit (llvm) openmp;
     stdenv = llvm.rocmClangStdenv;
   };
 
   rocmlir = callPackage ./rocmlir {
+    inherit rocmUpdateScript rocm-cmake clr;
     stdenv = llvm.rocmClangStdenv;
   };
 
@@ -193,47 +216,32 @@ in rec {
     buildRockCompiler = true;
   };
 
-  rocprim = callPackage ./rocprim {
+  hipsolver = callPackage ./hipsolver {
+    inherit rocmUpdateScript rocm-cmake rocblas rocsolver clr;
     stdenv = llvm.rocmClangStdenv;
   };
 
-  rocsparse = callPackage ./rocsparse {
+  hipblas = callPackage ./hipblas {
+    inherit rocmUpdateScript rocm-cmake rocblas rocsolver clr;
     stdenv = llvm.rocmClangStdenv;
   };
 
-  rocfft = callPackage ./rocfft {
-    inherit (llvm) openmp;
-    stdenv = llvm.rocmClangStdenv;
-  };
-
-  rocrand = callPackage ./rocrand {
-    stdenv = llvm.rocmClangStdenv;
-  };
-
-  tensile = python3Packages.callPackage ./tensile {
-    stdenv = llvm.rocmClangStdenv;
-  };
-
-  rocwmma = callPackage ./rocwmma {
-    inherit (llvm) openmp;
-    stdenv = llvm.rocmClangStdenv;
-  };
-
-  rocblas = callPackage ./rocblas {
-    inherit (llvm) openmp;
-    stdenv = llvm.rocmClangStdenv;
-  };
+  # hipBlasLt - Very broken with Tensile at the moment, only supports GFX9
+  # hipTensor - Only supports GFX9
 
   miopengemm = callPackage ./miopengemm {
+    inherit rocmUpdateScript rocm-cmake clr;
     stdenv = llvm.rocmClangStdenv;
   };
 
-  rocthrust = callPackage ./rocthrust {
+  composable_kernel = callPackage ./composable_kernel {
+    inherit (llvm) openmp clang-tools-extra;
     stdenv = llvm.rocmClangStdenv;
   };
 
   miopen = callPackage ./miopen {
-    inherit (llvm) llvm clang-tools-extra;
+    inherit rocmUpdateScript rocm-cmake rocblas clang-ocl miopengemm composable_kernel rocm-comgr clr rocm-docs-core half;
+    inherit (llvm) clang-tools-extra;
     stdenv = llvm.rocmClangStdenv;
     rocmlir = rocmlir-rock;
     boost = boost179.override { enableStatic = true; };
@@ -246,4 +254,11 @@ in rec {
   miopen-opencl = miopen.override {
     useOpenCL = true;
   };
+
+  migraphx = callPackage ./migraphx {
+    inherit rocmUpdateScript rocm-cmake rocblas composable_kernel miopengemm miopen clr half rocm-device-libs;
+    inherit (llvm) openmp clang-tools-extra;
+    stdenv = llvm.rocmClangStdenv;
+    rocmlir = rocmlir-rock;
+  };
 }
diff --git a/pkgs/development/rocm-modules/5/hipblas/default.nix b/pkgs/development/rocm-modules/5/hipblas/default.nix
index cb60e5de6633..b2206c737b00 100644
--- a/pkgs/development/rocm-modules/5/hipblas/default.nix
+++ b/pkgs/development/rocm-modules/5/hipblas/default.nix
@@ -4,7 +4,7 @@
 , rocmUpdateScript
 , cmake
 , rocm-cmake
-, hip
+, clr
 , gfortran
 , rocblas
 , rocsolver
@@ -40,7 +40,7 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
     gfortran
   ];
 
@@ -94,7 +94,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    # Fixed in develop branch by using C++17 and related refactor
-    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version || buildTests || buildBenchmarks || buildSamples;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/rocm-modules/5/hipcub/default.nix b/pkgs/development/rocm-modules/5/hipcub/default.nix
index b3a23241366f..447c2c4174af 100644
--- a/pkgs/development/rocm-modules/5/hipcub/default.nix
+++ b/pkgs/development/rocm-modules/5/hipcub/default.nix
@@ -5,11 +5,12 @@
 , cmake
 , rocm-cmake
 , rocprim
-, hip
+, clr
 , gtest
 , gbenchmark
 , buildTests ? false
 , buildBenchmarks ? false
+, gpuTargets ? [ ]
 }:
 
 # CUB can also be used as a backend instead of rocPRIM.
@@ -35,7 +36,7 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
   ];
 
   buildInputs = [
@@ -48,12 +49,14 @@ stdenv.mkDerivation (finalAttrs: {
 
   cmakeFlags = [
     "-DCMAKE_CXX_COMPILER=hipcc"
-    "-DHIP_ROOT_DIR=${hip}"
+    "-DHIP_ROOT_DIR=${clr}"
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
     "-DCMAKE_INSTALL_LIBDIR=lib"
     "-DCMAKE_INSTALL_INCLUDEDIR=include"
+  ] ++ lib.optionals (gpuTargets != [ ]) [
+    "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
   ] ++ lib.optionals buildTests [
     "-DBUILD_TEST=ON"
   ] ++ lib.optionals buildBenchmarks [
diff --git a/pkgs/development/rocm-modules/5/hipfft/default.nix b/pkgs/development/rocm-modules/5/hipfft/default.nix
index 1e959f0ad56a..153a7c8c18cc 100644
--- a/pkgs/development/rocm-modules/5/hipfft/default.nix
+++ b/pkgs/development/rocm-modules/5/hipfft/default.nix
@@ -4,7 +4,7 @@
 , rocmUpdateScript
 , cmake
 , rocm-cmake
-, hip
+, clr
 , git
 , rocfft
 , gtest
@@ -15,6 +15,7 @@
 , buildTests ? false
 , buildBenchmarks ? false
 , buildSamples ? false
+, gpuTargets ? [ ]
 }:
 
 # Can also use cuFFT
@@ -41,7 +42,7 @@ stdenv.mkDerivation (finalAttrs: {
   };
 
   nativeBuildInputs = [
-    hip
+    clr
     git
     cmake
     rocm-cmake
@@ -60,14 +61,16 @@ stdenv.mkDerivation (finalAttrs: {
   cmakeFlags = [
     "-DCMAKE_C_COMPILER=hipcc"
     "-DCMAKE_CXX_COMPILER=hipcc"
-    "-DCMAKE_MODULE_PATH=${hip}/lib/cmake/hip"
-    "-DHIP_ROOT_DIR=${hip}"
-    "-DHIP_PATH=${hip}"
+    "-DCMAKE_MODULE_PATH=${clr}/lib/cmake/hip"
+    "-DHIP_ROOT_DIR=${clr}"
+    "-DHIP_PATH=${clr}"
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
     "-DCMAKE_INSTALL_LIBDIR=lib"
     "-DCMAKE_INSTALL_INCLUDEDIR=include"
+  ] ++ lib.optionals (gpuTargets != [ ]) [
+    "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
   ] ++ lib.optionals buildTests [
     "-DBUILD_CLIENTS_TESTS=ON"
   ] ++ lib.optionals buildBenchmarks [
diff --git a/pkgs/development/rocm-modules/5/hipsolver/default.nix b/pkgs/development/rocm-modules/5/hipsolver/default.nix
index 238564d631bc..34592a5bbd96 100644
--- a/pkgs/development/rocm-modules/5/hipsolver/default.nix
+++ b/pkgs/development/rocm-modules/5/hipsolver/default.nix
@@ -4,7 +4,7 @@
 , rocmUpdateScript
 , cmake
 , rocm-cmake
-, hip
+, clr
 , gfortran
 , rocblas
 , rocsolver
@@ -40,7 +40,7 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
     gfortran
   ];
 
diff --git a/pkgs/development/rocm-modules/5/hipsparse/default.nix b/pkgs/development/rocm-modules/5/hipsparse/default.nix
index 6e6197209e4b..79b78f3661d8 100644
--- a/pkgs/development/rocm-modules/5/hipsparse/default.nix
+++ b/pkgs/development/rocm-modules/5/hipsparse/default.nix
@@ -5,13 +5,14 @@
 , cmake
 , rocm-cmake
 , rocsparse
-, hip
+, clr
 , gfortran
 , git
 , gtest
 , openmp
 , buildTests ? false
 , buildSamples ? false
+, gpuTargets ? [ ]
 }:
 
 # This can also use cuSPARSE as a backend instead of rocSPARSE
@@ -37,7 +38,7 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
     gfortran
   ];
 
@@ -59,16 +60,15 @@ stdenv.mkDerivation (finalAttrs: {
     "-DCMAKE_INSTALL_BINDIR=bin"
     "-DCMAKE_INSTALL_LIBDIR=lib"
     "-DCMAKE_INSTALL_INCLUDEDIR=include"
+  ] ++ lib.optionals (gpuTargets != [ ]) [
+    "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
   ] ++ lib.optionals buildTests [
     "-DBUILD_CLIENTS_TESTS=ON"
   ];
 
   # We have to manually generate the matrices
   # CMAKE_MATRICES_DIR seems to be reset in clients/tests/CMakeLists.txt
-  postPatch = ''
-    substituteInPlace clients/common/utility.cpp \
-      --replace "#ifdef __cpp_lib_filesystem" " #if true"
-  '' + lib.optionalString buildTests ''
+  postPatch = lib.optionalString buildTests ''
     mkdir -p matrices
 
     ln -s ${rocsparse.passthru.matrices.matrix-01}/*.mtx matrices
@@ -116,7 +116,7 @@ stdenv.mkDerivation (finalAttrs: {
     mkdir -p $sample/bin
     mv clients/staging/example_* $sample/bin
     patchelf --set-rpath $out/lib:${lib.makeLibraryPath (
-      finalAttrs.buildInputs ++ [ hip gfortran.cc ])} $sample/bin/example_*
+      finalAttrs.buildInputs ++ [ clr gfortran.cc ])} $sample/bin/example_*
   '';
 
   passthru.updateScript = rocmUpdateScript {
diff --git a/pkgs/development/rocm-modules/5/migraphx/default.nix b/pkgs/development/rocm-modules/5/migraphx/default.nix
index fdc97f45da9d..5842cd1695d5 100644
--- a/pkgs/development/rocm-modules/5/migraphx/default.nix
+++ b/pkgs/development/rocm-modules/5/migraphx/default.nix
@@ -5,11 +5,12 @@
 , pkg-config
 , cmake
 , rocm-cmake
-, hip
+, clr
 , clang-tools-extra
 , openmp
 , rocblas
 , rocmlir
+, composable_kernel
 , miopengemm
 , miopen
 , protobuf
@@ -19,6 +20,8 @@
 , sqlite
 , oneDNN_2
 , blaze
+, cppcheck
+, rocm-device-libs
 , texlive
 , doxygen
 , sphinx
@@ -67,7 +70,7 @@ in stdenv.mkDerivation (finalAttrs: {
     pkg-config
     cmake
     rocm-cmake
-    hip
+    clr
     clang-tools-extra
     python3Packages.python
   ] ++ lib.optionals buildDocs [
@@ -84,6 +87,7 @@ in stdenv.mkDerivation (finalAttrs: {
     openmp
     rocblas
     rocmlir
+    composable_kernel
     miopengemm
     miopen
     protobuf
@@ -93,18 +97,16 @@ in stdenv.mkDerivation (finalAttrs: {
     sqlite
     oneDNN_2
     blaze
+    cppcheck
     python3Packages.pybind11
     python3Packages.onnx
   ];
 
   cmakeFlags = [
-    "-DCMAKE_POLICY_DEFAULT_CMP0079=NEW"
-    # "-DCMAKE_C_COMPILER=hipcc"
-    # "-DCMAKE_CXX_COMPILER=hipcc"
-    "-DMIGRAPHX_ENABLE_GPU=OFF" # GPU compilation is broken, don't know why
+    "-DMIGRAPHX_ENABLE_GPU=ON"
     "-DMIGRAPHX_ENABLE_CPU=ON"
     "-DMIGRAPHX_ENABLE_FPGA=ON"
-    "-DMIGRAPHX_ENABLE_MLIR=ON"
+    "-DMIGRAPHX_ENABLE_MLIR=OFF" # LLVM or rocMLIR mismatch?
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
@@ -113,10 +115,20 @@ in stdenv.mkDerivation (finalAttrs: {
   ];
 
   postPatch = ''
+    # We need to not use hipcc and define the CXXFLAGS manually due to `undefined hidden symbol: tensorflow:: ...`
+    export CXXFLAGS+="--rocm-path=${clr} --rocm-device-lib-path=${rocm-device-libs}/amdgcn/bitcode"
     patchShebangs tools
 
+    # `error: '__clang_hip_runtime_wrapper.h' file not found [clang-diagnostic-error]`
+    substituteInPlace CMakeLists.txt \
+      --replace "set(MIGRAPHX_TIDY_ERRORS ALL)" ""
+
+    # JIT library was removed from composable_kernel...
+    # https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/782
     substituteInPlace src/targets/gpu/CMakeLists.txt \
-      --replace "CMAKE_CXX_COMPILER MATCHES \".*clang\\\+\\\+\$\"" "TRUE"
+      --replace " COMPONENTS jit_library" "" \
+      --replace " composable_kernel::jit_library" "" \
+      --replace "if(WIN32)" "if(TRUE)"
   '' + lib.optionalString (!buildDocs) ''
     substituteInPlace CMakeLists.txt \
       --replace "add_subdirectory(doc)" ""
diff --git a/pkgs/development/rocm-modules/5/miopen/default.nix b/pkgs/development/rocm-modules/5/miopen/default.nix
index dd7661a09756..d22518aa51c6 100644
--- a/pkgs/development/rocm-modules/5/miopen/default.nix
+++ b/pkgs/development/rocm-modules/5/miopen/default.nix
@@ -1,23 +1,26 @@
 { lib
 , stdenv
 , fetchFromGitHub
-, fetchurl
+, fetchpatch
 , rocmUpdateScript
+, runCommand
 , pkg-config
 , cmake
 , rocm-cmake
 , rocblas
 , rocmlir
-, hip
+, clr
 , clang-tools-extra
 , clang-ocl
-, llvm
 , miopengemm
 , composable_kernel
+, frugally-deep
+, rocm-docs-core
 , half
 , boost
 , sqlite
 , bzip2
+, lbzip2
 , nlohmann_json
 , texlive
 , doxygen
@@ -26,13 +29,48 @@
 , gtest
 , rocm-comgr
 , python3Packages
-, buildDocs ? true
+, buildDocs ? false # Needs internet because of rocm-docs-core
 , buildTests ? false
-, fetchKDBs ? true
 , useOpenCL ? false
 }:
 
 let
+  version = "5.7.0";
+
+  src = fetchFromGitHub {
+    owner = "ROCmSoftwarePlatform";
+    repo = "MIOpen";
+    rev = "rocm-${version}";
+    hash = "sha256-xcKmFI8HcRA9bbh6EQGElKykIQ3RJX/q5f4IxXvM1Is=";
+    fetchLFS = true;
+    leaveDotGit = true;
+
+    # If you're reading this, it's gonna take a bit of time.
+    # fetchSubModules doesn't work with postFetch???
+    # fetchLFS isn't actually fetching the LFS files...
+    postFetch = ''
+      export HOME=$(mktemp -d)
+      cd $out
+
+      # We need more history to fetch LFS files
+      git remote add origin $url
+      git fetch origin
+      git clean -fdx
+      git checkout rocm-${version}
+
+      # We need to do this manually since using leaveDotGit and fetchSubmodules errors
+      git submodule update --init
+
+      # Fetch the LFS files
+      git lfs install
+      git lfs fetch --all
+      git lfs checkout
+
+      # Remove the defunct .git folder
+      rm -rf .git
+    '';
+  };
+
   latex = lib.optionalAttrs buildDocs texlive.combine {
     inherit (texlive) scheme-small
     latexmk
@@ -47,13 +85,40 @@ let
     titlesec;
   };
 
-  kdbs = lib.optionalAttrs fetchKDBs import ./deps.nix {
-    inherit fetchurl;
-    mirror = "https://repo.radeon.com/rocm/miopen-kernel/rel-5.0";
-  };
+  gfx900 = runCommand "miopen-gfx900.kdb" { preferLocalBuild = true; } ''
+    ${lbzip2}/bin/lbzip2 -ckd ${src}/src/kernels/gfx900.kdb.bz2 > $out
+  '';
+
+  gfx906 = runCommand "miopen-gfx906.kdb" { preferLocalBuild = true; } ''
+    ${lbzip2}/bin/lbzip2 -ckd ${src}/src/kernels/gfx906.kdb.bz2 > $out
+  '';
+
+  gfx908 = runCommand "miopen-gfx908.kdb" { preferLocalBuild = true; } ''
+    ${lbzip2}/bin/lbzip2 -ckd ${src}/src/kernels/gfx908.kdb.bz2 > $out
+  '';
+
+  gfx90a = runCommand "miopen-gfx90a.kdb" { preferLocalBuild = true; } ''
+    ${lbzip2}/bin/lbzip2 -ckd ${src}/src/kernels/gfx90a.kdb.bz2 > $out
+  '';
+
+  gfx1030 = runCommand "miopen-gfx1030.kdb" { preferLocalBuild = true; } ''
+    ${lbzip2}/bin/lbzip2 -ckd ${src}/src/kernels/gfx1030.kdb.bz2 > $out
+  '';
 in stdenv.mkDerivation (finalAttrs: {
+  inherit version src;
   pname = "miopen";
-  version = "5.7.0";
+
+  # Find zstd and add to target. Mainly for torch.
+  patches = [
+    (fetchpatch {
+      url = "https://github.com/ROCmSoftwarePlatform/MIOpen/commit/e608b4325646afeabb5e52846997b926d2019d19.patch";
+      hash = "sha256-oxa3qlIC2bzbwGxrQOZXoY/S7CpLsMrnWRB7Og0tk0M=";
+    })
+    (fetchpatch {
+      url = "https://github.com/ROCmSoftwarePlatform/MIOpen/commit/3413d2daaeb44b7d6eadcc03033a5954a118491e.patch";
+      hash = "sha256-ST4snUcTmmSI1Ogx815KEX9GdMnmubsavDzXCGJkiKs=";
+    })
+  ];
 
   outputs = [
     "out"
@@ -63,23 +128,15 @@ in stdenv.mkDerivation (finalAttrs: {
     "test"
   ];
 
-  src = fetchFromGitHub {
-    owner = "ROCmSoftwarePlatform";
-    repo = "MIOpen";
-    rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-6Bz4yDbQtV8XlEpwbH0YsJFaaZqH7BOfZDL7F4JTS1Q=";
-  };
-
   nativeBuildInputs = [
     pkg-config
     cmake
     rocm-cmake
-    hip
+    clr
     clang-tools-extra
   ];
 
   buildInputs = [
-    llvm
     rocblas
     rocmlir
     clang-ocl
@@ -90,10 +147,12 @@ in stdenv.mkDerivation (finalAttrs: {
     sqlite
     bzip2
     nlohmann_json
+    frugally-deep
   ] ++ lib.optionals buildDocs [
     latex
     doxygen
     sphinx
+    rocm-docs-core
     python3Packages.sphinx-rtd-theme
     python3Packages.breathe
     python3Packages.myst-parser
@@ -102,7 +161,9 @@ in stdenv.mkDerivation (finalAttrs: {
   ];
 
   cmakeFlags = [
+    "-DCMAKE_CXX_FLAGS=-Wno-#warnings" # <half> -> <half/half.hpp>
     "-DMIOPEN_USE_MIOPENGEMM=ON"
+    "-DUNZIPPER=${bzip2}/bin/bunzip2"
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
@@ -117,58 +178,47 @@ in stdenv.mkDerivation (finalAttrs: {
   ] ++ lib.optionals buildTests [
     "-DBUILD_TESTS=ON"
     "-DMIOPEN_TEST_ALL=ON"
-    "-DMIOPEN_TEST_GFX900=ON"
-    "-DMIOPEN_TEST_GFX906=ON"
-    "-DMIOPEN_TEST_GFX908=ON"
-    "-DMIOPEN_TEST_GFX90A=ON"
-    "-DMIOPEN_TEST_GFX103X=ON"
-    "-DGOOGLETEST_DIR=${gtest.src}" # Custom linker names
   ];
 
   postPatch = ''
+    patchShebangs test src/composable_kernel fin utils install_deps.cmake
+
     substituteInPlace CMakeLists.txt \
-      --replace "enable_testing()" "" \
+      --replace "unpack_db(\"\''${CMAKE_SOURCE_DIR}/src/kernels/\''${FILE_NAME}.kdb.bz2\")" "" \
       --replace "MIOPEN_HIP_COMPILER MATCHES \".*clang\\\\+\\\\+$\"" "true" \
       --replace "set(MIOPEN_TIDY_ERRORS ALL)" "" # error: missing required key 'key'
-  '' + lib.optionalString buildTests ''
+
     substituteInPlace test/gtest/CMakeLists.txt \
-      --replace "enable_testing()" ""
-  '' + lib.optionalString (!buildTests) ''
-    substituteInPlace CMakeLists.txt \
-      --replace "add_subdirectory(test)" ""
-  '' + lib.optionalString fetchKDBs ''
-    ln -sf ${kdbs.gfx1030_36} src/kernels/gfx1030_36.kdb
-    ln -sf ${kdbs.gfx900_56} src/kernels/gfx900_56.kdb
-    ln -sf ${kdbs.gfx900_64} src/kernels/gfx900_64.kdb
-    ln -sf ${kdbs.gfx906_60} src/kernels/gfx906_60.kdb
-    ln -sf ${kdbs.gfx906_64} src/kernels/gfx906_64.kdb
-    ln -sf ${kdbs.gfx90878} src/kernels/gfx90878.kdb
-    ln -sf ${kdbs.gfx90a68} src/kernels/gfx90a68.kdb
-    ln -sf ${kdbs.gfx90a6e} src/kernels/gfx90a6e.kdb
+      --replace "include(googletest)" ""
+
+    ln -sf ${gfx900} src/kernels/gfx900.kdb
+    ln -sf ${gfx906} src/kernels/gfx906.kdb
+    ln -sf ${gfx908} src/kernels/gfx908.kdb
+    ln -sf ${gfx90a} src/kernels/gfx90a.kdb
+    ln -sf ${gfx1030} src/kernels/gfx1030.kdb
   '';
 
   # Unfortunately, it seems like we have to call make on these manually
   postBuild = lib.optionalString buildDocs ''
-    export HOME=$(mktemp -d)
-    make -j$NIX_BUILD_CORES doc
+    python -m sphinx -T -E -b html -d _build/doctrees -D language=en ../docs _build/html
   '' + lib.optionalString buildTests ''
     make -j$NIX_BUILD_CORES check
   '';
 
   postInstall = ''
     rm $out/bin/install_precompiled_kernels.sh
+    ln -sf ${gfx900} $out/share/miopen/db/gfx900.kdb
+    ln -sf ${gfx906} $out/share/miopen/db/gfx906.kdb
+    ln -sf ${gfx908} $out/share/miopen/db/gfx908.kdb
+    ln -sf ${gfx90a} $out/share/miopen/db/gfx90a.kdb
+    ln -sf ${gfx1030} $out/share/miopen/db/gfx1030.kdb
   '' + lib.optionalString buildDocs ''
     mv ../doc/html $out/share/doc/miopen-${if useOpenCL then "opencl" else "hip"}
-    mv ../doc/pdf/miopen.pdf $out/share/doc/miopen-${if useOpenCL then "opencl" else "hip"}
   '' + lib.optionalString buildTests ''
     mkdir -p $test/bin
     mv bin/test_* $test/bin
     patchelf --set-rpath $out/lib:${lib.makeLibraryPath (finalAttrs.buildInputs ++
-      [ hip rocm-comgr ])} $test/bin/*
-  '' + lib.optionalString fetchKDBs ''
-    # Apparently gfx1030_40 wasn't generated so the developers suggest just renaming gfx1030_36 to it
-    # Should be fixed in the next miopen kernel generation batch
-    ln -s ${kdbs.gfx1030_36} $out/share/miopen/db/gfx1030_40.kdb
+      [ clr rocm-comgr ])} $test/bin/*
   '';
 
   requiredSystemFeatures = [ "big-parallel" ];
diff --git a/pkgs/development/rocm-modules/5/miopen/deps.nix b/pkgs/development/rocm-modules/5/miopen/deps.nix
deleted file mode 100644
index e88b61ad974c..000000000000
--- a/pkgs/development/rocm-modules/5/miopen/deps.nix
+++ /dev/null
@@ -1,45 +0,0 @@
-{ fetchurl
-, mirror
-}:
-
-{
-  gfx1030_36 = fetchurl {
-    sha256 = "sha256-zEXDLkRWAHS15LDA5IRyqG5rO7HHPBiVgPlQ8JjSqNc=";
-    url = "${mirror}/gfx1030_36.kdb";
-  };
-
-  gfx900_56 = fetchurl {
-    sha256 = "sha256-ZTqUPhVKcQzjO6bxykvZMJk1VZh31dRVs+XqcxEtmeI=";
-    url = "${mirror}/gfx900_56.kdb";
-  };
-
-  gfx900_64 = fetchurl {
-    sha256 = "sha256-ZTqUPhVKcQzjO6bxykvZMJk1VZh31dRVs+XqcxEtmeI=";
-    url = "${mirror}/gfx900_64.kdb";
-  };
-
-  gfx906_60 = fetchurl {
-    sha256 = "sha256-U6pDo8ICfs6fVIEqRziWeE5/4Vzvu41JkcRVn3ou1e4=";
-    url = "${mirror}/gfx906_60.kdb";
-  };
-
-  gfx906_64 = fetchurl {
-    sha256 = "sha256-U6pDo8ICfs6fVIEqRziWeE5/4Vzvu41JkcRVn3ou1e4=";
-    url = "${mirror}/gfx906_64.kdb";
-  };
-
-  gfx90878 = fetchurl {
-    sha256 = "sha256-r7DRhNH+jHUXAu64b9vWsZzGD4w5oSHnxH0l2RN0qlQ=";
-    url = "${mirror}/gfx90878.kdb";
-  };
-
-  gfx90a68 = fetchurl {
-    sha256 = "sha256-NT//zIPTbzsPJyaVycxwU6BcMTzGc/d+Z4Ab9FImDko=";
-    url = "${mirror}/gfx90a68.kdb";
-  };
-
-  gfx90a6e = fetchurl {
-    sha256 = "sha256-ENZHbf+/MGYgSTpALKh2meuZPNhH5bG+WrW/jzvGpBs=";
-    url = "${mirror}/gfx90a6e.kdb";
-  };
-}
diff --git a/pkgs/development/rocm-modules/5/miopengemm/default.nix b/pkgs/development/rocm-modules/5/miopengemm/default.nix
index 92616f79eedc..bda94cee61b3 100644
--- a/pkgs/development/rocm-modules/5/miopengemm/default.nix
+++ b/pkgs/development/rocm-modules/5/miopengemm/default.nix
@@ -4,7 +4,8 @@
 , rocmUpdateScript
 , cmake
 , rocm-cmake
-, rocm-opencl-runtime
+, clr
+, clblast
 , texlive
 , doxygen
 , sphinx
@@ -53,11 +54,10 @@ in stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
+    clr
   ];
 
-  buildInputs = [
-    rocm-opencl-runtime
-  ] ++ lib.optionals buildDocs [
+  buildInputs = lib.optionals buildDocs [
     latex
     doxygen
     sphinx
@@ -65,6 +65,9 @@ in stdenv.mkDerivation (finalAttrs: {
     python3Packages.breathe
   ] ++ lib.optionals buildTests [
     openblas
+  ] ++ lib.optionals buildBenchmarks [
+    clblast
+    python3Packages.openai-triton
   ];
 
   cmakeFlags = [
@@ -77,10 +80,8 @@ in stdenv.mkDerivation (finalAttrs: {
     "-DOPENBLAS=ON"
   ] ++ lib.optionals buildBenchmarks [
     "-DAPI_BENCH_MIOGEMM=ON"
-    # Needs https://github.com/CNugteren/CLBlast
-    # "-DAPI_BENCH_CLBLAST=ON"
-    # Needs https://github.com/openai/triton
-    # "-DAPI_BENCH_ISAAC=ON"
+    "-DAPI_BENCH_CLBLAST=ON"
+    "-DAPI_BENCH_ISAAC=ON"
   ];
 
   # Unfortunately, it seems like we have to call make on these manually
@@ -118,6 +119,8 @@ in stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
+    # They are not making tags or releases, this may break other derivations in the future
+    # Use version major instead of minor, 6.0 will HOPEFULLY have a release or tag
+    broken = versions.major finalAttrs.version != versions.major stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/rocm-modules/5/rccl/default.nix b/pkgs/development/rocm-modules/5/rccl/default.nix
index 68dafc29c164..d4045252bae4 100644
--- a/pkgs/development/rocm-modules/5/rccl/default.nix
+++ b/pkgs/development/rocm-modules/5/rccl/default.nix
@@ -5,10 +5,13 @@
 , cmake
 , rocm-cmake
 , rocm-smi
-, hip
+, clr
+, perl
+, hipify
 , gtest
 , chrpath
 , buildTests ? false
+, gpuTargets ? [ ]
 }:
 
 stdenv.mkDerivation (finalAttrs: {
@@ -31,7 +34,9 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
+    perl
+    hipify
   ];
 
   buildInputs = [
@@ -42,22 +47,25 @@ stdenv.mkDerivation (finalAttrs: {
   ];
 
   cmakeFlags = [
-    "-DCMAKE_C_COMPILER=hipcc"
     "-DCMAKE_CXX_COMPILER=hipcc"
+    "-DBUILD_BFD=OFF" # Can't get it to detect bfd.h
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
     "-DCMAKE_INSTALL_LIBDIR=lib"
     "-DCMAKE_INSTALL_INCLUDEDIR=include"
+  ] ++ lib.optionals (gpuTargets != [ ]) [
+    "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
   ] ++ lib.optionals buildTests [
     "-DBUILD_TESTS=ON"
   ];
 
-  # Replace the manually set parallel jobs to NIX_BUILD_CORES
   postPatch = ''
+    patchShebangs src tools
+
+    # Really strange behavior, `#!/usr/bin/env perl` should work...
     substituteInPlace CMakeLists.txt \
-      --replace "8 P" "$NIX_BUILD_CORES P" \
-      --replace "8)" "$NIX_BUILD_CORES)"
+      --replace "\''$ \''${hipify-perl_executable}" "${perl}/bin/perl ${hipify}/bin/hipify-perl"
   '';
 
   postInstall = lib.optionalString buildTests ''
diff --git a/pkgs/development/rocm-modules/5/rocalution/default.nix b/pkgs/development/rocm-modules/5/rocalution/default.nix
index 650e9dc7a1ca..80fd655557df 100644
--- a/pkgs/development/rocm-modules/5/rocalution/default.nix
+++ b/pkgs/development/rocm-modules/5/rocalution/default.nix
@@ -8,7 +8,7 @@
 , rocsparse
 , rocprim
 , rocrand
-, hip
+, clr
 , git
 , openmp
 , openmpi
@@ -43,7 +43,7 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
     git
   ];
 
@@ -60,8 +60,8 @@ stdenv.mkDerivation (finalAttrs: {
 
   cmakeFlags = [
     "-DCMAKE_CXX_COMPILER=hipcc"
-    "-DROCM_PATH=${hip}"
-    "-DHIP_ROOT_DIR=${hip}"
+    "-DROCM_PATH=${clr}"
+    "-DHIP_ROOT_DIR=${clr}"
     "-DSUPPORT_HIP=ON"
     "-DSUPPORT_OMP=ON"
     "-DSUPPORT_MPI=ON"
@@ -92,7 +92,7 @@ stdenv.mkDerivation (finalAttrs: {
     rm $sample/bin/rocalution-bench || true
 
     patchelf --set-rpath \
-      $out/lib:${lib.makeLibraryPath (finalAttrs.buildInputs ++ [ hip ])} \
+      $out/lib:${lib.makeLibraryPath (finalAttrs.buildInputs ++ [ clr ])} \
       $sample/bin/*
   '' + lib.optionalString (buildTests || buildBenchmarks) ''
     rmdir $out/bin
diff --git a/pkgs/development/rocm-modules/5/rocblas/default.nix b/pkgs/development/rocm-modules/5/rocblas/default.nix
index 59d23ad121da..f1cd81df663f 100644
--- a/pkgs/development/rocm-modules/5/rocblas/default.nix
+++ b/pkgs/development/rocm-modules/5/rocblas/default.nix
@@ -4,7 +4,7 @@
 , rocmUpdateScript
 , cmake
 , rocm-cmake
-, hip
+, clr
 , python3
 , tensile
 , msgpack
@@ -18,13 +18,14 @@
 , buildTests ? false
 , buildBenchmarks ? false
 , tensileLogic ? "asm_full"
-, tensileCOVersion ? "V3"
+, tensileCOVersion ? "default"
 , tensileSepArch ? true
 , tensileLazyLib ? true
 , tensileLibFormat ? "msgpack"
 , gpuTargets ? [ "all" ]
 }:
 
+# rocBLAS is 3.7GB... I'll have to figure out hydra in another PR
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocblas";
   version = "5.7.0";
@@ -47,7 +48,7 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
   ];
 
   buildInputs = [
@@ -56,6 +57,7 @@ stdenv.mkDerivation (finalAttrs: {
     msgpack
     libxml2
     python3Packages.msgpack
+    python3Packages.joblib
   ] ++ lib.optionals buildTests [
     gtest
   ] ++ lib.optionals (buildTests || buildBenchmarks) [
diff --git a/pkgs/development/rocm-modules/5/rocfft/default.nix b/pkgs/development/rocm-modules/5/rocfft/default.nix
index ee1078eabb28..ac50318622ce 100644
--- a/pkgs/development/rocm-modules/5/rocfft/default.nix
+++ b/pkgs/development/rocm-modules/5/rocfft/default.nix
@@ -4,7 +4,7 @@
 , fetchFromGitHub
 , rocmUpdateScript
 , cmake
-, hip
+, clr
 , python3
 , rocm-cmake
 , sqlite
@@ -14,73 +14,9 @@
 , gtest
 , openmp
 , rocrand
-# NOTE: Update the default GPU targets on every update
-, gpuTargets ? [
-  "gfx803"
-  "gfx900"
-  "gfx906"
-  "gfx908"
-  "gfx90a"
-  "gfx1030"
-  "gfx1100"
-  "gfx1102"
-]
+, gpuTargets ? [ ]
 }:
 
-let
-  # To avoid output limit exceeded errors in hydra, we build kernel
-  # device libs and the kernel RTC cache database in separate derivations
-  kernelDeviceLibs = map
-    (target:
-      (rocfft.overrideAttrs (prevAttrs: {
-        pname = "rocfft-device-${target}";
-
-        patches = prevAttrs.patches ++ [
-          # Add back install rule for device library
-          # This workaround is needed because rocm_install_targets
-          # doesn't support an EXCLUDE_FROM_ALL option
-          ./device-install.patch
-        ];
-
-        buildFlags = [ "rocfft-device-${target}" ];
-
-        installPhase = ''
-          runHook preInstall
-          cmake --install . --component device
-          runHook postInstall
-        '';
-
-        requiredSystemFeatures = [ "big-parallel" ];
-      })).override {
-        gpuTargets = [ target ];
-      }
-    )
-    gpuTargets;
-
-  # TODO: Figure out how to also split this by GPU target
-  #
-  # It'll be bit more complicated than what we're doing for the kernel
-  # device libs, because the kernel cache needs to be compiled into
-  # one sqlite database (whereas the device libs can be linked into
-  # rocfft as separate libraries for each GPU target).
-  #
-  # It's not clear why this needs to even be a db in the first place.
-  # It would simplify things A LOT if we could just store these
-  # pre-compiled kernels as files (but that'd need a lot of patching).
-  kernelRtcCache = rocfft.overrideAttrs (_: {
-    pname = "rocfft-kernel-cache";
-
-    buildFlags = [ "rocfft_kernel_cache_target" ];
-
-    installPhase = ''
-      runHook preInstall
-      cmake --install . --component kernel_cache
-      runHook postInstall
-    '';
-
-    requiredSystemFeatures = [ "big-parallel" ];
-  });
-in
 stdenv.mkDerivation (finalAttrs: {
   pname = "rocfft";
   version = "5.7.0";
@@ -92,40 +28,28 @@ stdenv.mkDerivation (finalAttrs: {
     hash = "sha256-GZSi03geTT+NUztBWhGYyghLqJGsFjUQzVAKQ7d03uA=";
   };
 
-  patches = [
-    # Exclude kernel compilation & installation from "all" target,
-    # and split device libraries by GPU target
-    ./split-kernel-compilation.patch
-  ];
-
   nativeBuildInputs = [
     cmake
-    hip
+    clr
     python3
     rocm-cmake
   ];
 
-  buildInputs = [
-    sqlite
-  ] ++ lib.optionals (finalAttrs.pname == "rocfft") kernelDeviceLibs;
+  buildInputs = [ sqlite ];
 
   cmakeFlags = [
     "-DCMAKE_C_COMPILER=hipcc"
     "-DCMAKE_CXX_COMPILER=hipcc"
-    "-DUSE_HIP_CLANG=ON"
     "-DSQLITE_USE_SYSTEM_PACKAGE=ON"
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
     "-DCMAKE_INSTALL_LIBDIR=lib"
     "-DCMAKE_INSTALL_INCLUDEDIR=include"
+  ] ++ lib.optionals (gpuTargets != [ ]) [
     "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
   ];
 
-  postInstall = lib.optionalString (finalAttrs.pname == "rocfft") ''
-    ln -s ${kernelRtcCache}/lib/rocfft_kernel_cache.db "$out/lib"
-  '';
-
   passthru = {
     test = stdenv.mkDerivation {
       pname = "${finalAttrs.pname}-test";
@@ -135,7 +59,7 @@ stdenv.mkDerivation (finalAttrs: {
 
       nativeBuildInputs = [
         cmake
-        hip
+        clr
         rocm-cmake
       ];
 
@@ -168,7 +92,7 @@ stdenv.mkDerivation (finalAttrs: {
 
       nativeBuildInputs = [
         cmake
-        hip
+        clr
         rocm-cmake
       ];
 
@@ -201,7 +125,7 @@ stdenv.mkDerivation (finalAttrs: {
 
       nativeBuildInputs = [
         cmake
-        hip
+        clr
         rocm-cmake
       ];
 
@@ -232,6 +156,8 @@ stdenv.mkDerivation (finalAttrs: {
     };
   };
 
+  requiredSystemFeatures = [ "big-parallel" ];
+
   meta = with lib; {
     description = "FFT implementation for ROCm";
     homepage = "https://github.com/ROCmSoftwarePlatform/rocFFT";
diff --git a/pkgs/development/rocm-modules/5/rocfft/device-install.patch b/pkgs/development/rocm-modules/5/rocfft/device-install.patch
deleted file mode 100644
index 355cf30d07ff..000000000000
--- a/pkgs/development/rocm-modules/5/rocfft/device-install.patch
+++ /dev/null
@@ -1,15 +0,0 @@
-diff --git a/library/src/device/CMakeLists.txt b/library/src/device/CMakeLists.txt
-index 73a8ec9..9bfd4b8 100644
---- a/library/src/device/CMakeLists.txt
-+++ b/library/src/device/CMakeLists.txt
-@@ -255,4 +255,10 @@ foreach( sub ${AMDGPU_TARGETS} )
-   if( NOT BUILD_SHARED_LIBS )
-     set_target_properties( rocfft-device-${sub} PROPERTIES PREFIX "lib" )
-   endif( )
-+
-+  rocm_install_targets(
-+    TARGETS
-+    rocfft-device-${sub}
-+    COMPONENT device
-+  )
- endforeach()
diff --git a/pkgs/development/rocm-modules/5/rocfft/split-kernel-compilation.patch b/pkgs/development/rocm-modules/5/rocfft/split-kernel-compilation.patch
deleted file mode 100644
index 5d71fe399c1a..000000000000
--- a/pkgs/development/rocm-modules/5/rocfft/split-kernel-compilation.patch
+++ /dev/null
@@ -1,124 +0,0 @@
-diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt
-index 3a16304..606b711 100644
---- a/library/src/CMakeLists.txt
-+++ b/library/src/CMakeLists.txt
-@@ -250,12 +250,12 @@ foreach( target
- 
- endforeach()
- 
--add_executable( rocfft_aot_helper
-+add_executable( rocfft_aot_helper EXCLUDE_FROM_ALL
-   rocfft_aot_helper.cpp
-   rocfft_stub.cpp
- )
- 
--add_executable( rocfft_config_search
-+add_executable( rocfft_config_search EXCLUDE_FROM_ALL
-   rocfft_config_search.cpp
-   rocfft_stub.cpp
- )
-@@ -279,10 +279,10 @@ endif()
- 
- target_link_libraries( rocfft PRIVATE ${ROCFFT_DEVICE_LINK_LIBS} )
- 
--target_link_libraries( rocfft PRIVATE rocfft-device-0 )
--target_link_libraries( rocfft PRIVATE rocfft-device-1 )
--target_link_libraries( rocfft PRIVATE rocfft-device-2 )
--target_link_libraries( rocfft PRIVATE rocfft-device-3 )
-+foreach( sub ${AMDGPU_TARGETS} )
-+  target_link_libraries( rocfft PRIVATE -lrocfft-device-${sub} )
-+endforeach()
-+
- foreach( target rocfft rocfft_aot_helper rocfft_config_search )
-   # RTC uses dladdr to find the RTC helper program
-   if( NOT WIN32 )
-@@ -347,7 +347,7 @@ add_custom_command(
-   DEPENDS rocfft_aot_helper rocfft_rtc_helper
-   COMMENT "Compile kernels into shipped cache file"
- )
--add_custom_target( rocfft_kernel_cache_target ALL
-+add_custom_target( rocfft_kernel_cache_target
-   DEPENDS rocfft_kernel_cache.db
-   VERBATIM
- )
-@@ -392,7 +392,8 @@ else()
- endif()
- rocm_install(FILES ${ROCFFT_KERNEL_CACHE_PATH}
-   DESTINATION "${ROCFFT_KERNEL_CACHE_INSTALL_DIR}"
--  COMPONENT runtime
-+  COMPONENT kernel_cache
-+  EXCLUDE_FROM_ALL
- )
- 
- #         PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
-diff --git a/library/src/device/CMakeLists.txt b/library/src/device/CMakeLists.txt
-index 9f7b85f..73a8ec9 100644
---- a/library/src/device/CMakeLists.txt
-+++ b/library/src/device/CMakeLists.txt
-@@ -170,11 +170,11 @@ list( SORT rocfft_device_source )
- # functions callable by rocFFT and depends on amdhip64, and another
- # one usable by AOT RTC that contains no device code
- list( FILTER rocfft_device_source EXCLUDE REGEX function_pool.cpp )
--add_library( rocfft-function-pool OBJECT
-+add_library( rocfft-function-pool OBJECT EXCLUDE_FROM_ALL
-   function_pool.cpp
- )
- target_compile_definitions( rocfft-function-pool PRIVATE FUNCTION_POOL_STANDALONE_BODY= )
--add_library( rocfft-function-pool-standalone OBJECT
-+add_library( rocfft-function-pool-standalone OBJECT EXCLUDE_FROM_ALL
-   function_pool.cpp
- )
- target_compile_definitions( rocfft-function-pool-standalone PRIVATE FUNCTION_POOL_STANDALONE_BODY={} )
-@@ -193,26 +193,15 @@ foreach( pool rocfft-function-pool rocfft-function-pool-standalone )
-   add_dependencies(${pool} gen_headers_target)
- endforeach()
- 
--list( LENGTH rocfft_device_source rocfft_device_source_len )
--math(EXPR split_len "${rocfft_device_source_len} / 4")
--math(EXPR split_idx_2 "${rocfft_device_source_len} / 4 * 2")
--math(EXPR split_idx_3 "${rocfft_device_source_len} / 4 * 3")
--
--list( SUBLIST rocfft_device_source 0 ${split_len} rocfft_device_source_0 )
--list( SUBLIST rocfft_device_source ${split_len} ${split_len} rocfft_device_source_1 )
--list( SUBLIST rocfft_device_source ${split_idx_2} ${split_len} rocfft_device_source_2 )
--list( SUBLIST rocfft_device_source ${split_idx_3} -1 rocfft_device_source_3 )
--
--foreach( sub RANGE 3 )
--  set( rocfft_device_source_var rocfft_device_source_${sub} )
-+foreach( sub ${AMDGPU_TARGETS} )
-   if(NOT SINGLELIB)
--    add_library( rocfft-device-${sub}
--      ${${rocfft_device_source_var}} )
-+    add_library( rocfft-device-${sub} EXCLUDE_FROM_ALL
-+      ${rocfft_device_source} )
-   else()
-     # Compile the device lib as a static library, which is then linked
-     # into librocfft.so Useful for testing purposes.
--    add_library( rocfft-device-${sub} STATIC
--      ${${rocfft_device_source_var}} )
-+    add_library( rocfft-device-${sub} STATIC EXCLUDE_FROM_ALL
-+      ${rocfft_device_source} )
- 
-     # if we're building singlelib, we don't want to export any of the
-     # device library symbols to the main library
-@@ -241,9 +230,7 @@ foreach( sub RANGE 3 )
-   # Set AMD GPU architecture options
- 
-   # Enable compilation of desired architectures
--  foreach( target ${AMDGPU_TARGETS} )
--    target_compile_options( rocfft-device-${sub} PRIVATE --offload-arch=${target} )
--  endforeach( )
-+  target_compile_options( rocfft-device-${sub} PRIVATE --offload-arch=${sub} )
- 
-   target_include_directories( rocfft-device-${sub}
-     PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
-@@ -268,9 +255,4 @@ foreach( sub RANGE 3 )
-   if( NOT BUILD_SHARED_LIBS )
-     set_target_properties( rocfft-device-${sub} PROPERTIES PREFIX "lib" )
-   endif( )
--
--  rocm_install_targets(
--    TARGETS
--    rocfft-device-${sub}
--    )
- endforeach()
diff --git a/pkgs/development/rocm-modules/5/rocmlir/default.nix b/pkgs/development/rocm-modules/5/rocmlir/default.nix
index a2a4923148a0..9b24112dce8a 100644
--- a/pkgs/development/rocm-modules/5/rocmlir/default.nix
+++ b/pkgs/development/rocm-modules/5/rocmlir/default.nix
@@ -3,28 +3,35 @@
 , fetchFromGitHub
 , rocmUpdateScript
 , cmake
+, rocm-cmake
 , ninja
-, hip
-, rocminfo
+, clr
 , git
 , libxml2
 , libedit
+, zstd
 , zlib
 , ncurses
-, python3
+, python3Packages
 , buildRockCompiler ? false
+, buildTests ? false # `argument of type 'NoneType' is not iterable`
 }:
 
 # Theoretically, we could have our MLIR have an output
 # with the source and built objects so that we can just
 # use it as the external LLVM repo for this
 let
+  suffix =
+    if buildRockCompiler
+    then "-rock"
+    else "";
+
   llvmNativeTarget =
     if stdenv.isx86_64 then "X86"
     else if stdenv.isAarch64 then "AArch64"
     else throw "Unsupported ROCm LLVM platform";
 in stdenv.mkDerivation (finalAttrs: {
-  pname = "rocmlir";
+  pname = "rocmlir${suffix}";
   version = "5.7.0";
 
   outputs = [
@@ -42,44 +49,61 @@ in stdenv.mkDerivation (finalAttrs: {
 
   nativeBuildInputs = [
     cmake
+    rocm-cmake
     ninja
-  ] ++ lib.optionals (!buildRockCompiler) [
-    hip
+    clr
+    python3Packages.python
+    python3Packages.tomli
   ];
 
   buildInputs = [
     git
     libxml2
     libedit
-    python3
   ];
 
   propagatedBuildInputs = [
+    zstd
     zlib
     ncurses
   ];
 
   cmakeFlags = [
     "-DLLVM_TARGETS_TO_BUILD=AMDGPU;${llvmNativeTarget}"
+    "-DLLVM_ENABLE_ZSTD=ON"
     "-DLLVM_ENABLE_ZLIB=ON"
     "-DLLVM_ENABLE_TERMINFO=ON"
+    "-DROCM_PATH=${clr}"
+    # Manually define CMAKE_INSTALL_<DIR>
+    # See: https://github.com/NixOS/nixpkgs/pull/197838
+    "-DCMAKE_INSTALL_BINDIR=bin"
+    "-DCMAKE_INSTALL_LIBDIR=lib"
+    "-DCMAKE_INSTALL_INCLUDEDIR=include"
   ] ++ lib.optionals buildRockCompiler [
     "-DBUILD_FAT_LIBROCKCOMPILER=ON"
   ] ++ lib.optionals (!buildRockCompiler) [
-    "-DROCM_PATH=${rocminfo}"
     "-DROCM_TEST_CHIPSET=gfx000"
   ];
 
+  postPatch = ''
+    patchShebangs mlir
+
+    substituteInPlace mlir/utils/performance/common/CMakeLists.txt \
+      --replace "/opt/rocm" "${clr}"
+  '';
+
   dontBuild = true;
   doCheck = true;
 
   # Certain libs aren't being generated, try enabling tests next update
   checkTarget = if buildRockCompiler
                 then "librockCompiler"
-                else "check-mlir-miopen-build-only";
+                else if buildTests
+                then "check-rocmlir"
+                else "check-rocmlir-build-only";
 
   postInstall = let
-    libPath = lib.makeLibraryPath [ zlib ncurses hip stdenv.cc.cc ];
+    libPath = lib.makeLibraryPath [ zstd zlib ncurses clr stdenv.cc.cc ];
   in lib.optionals (!buildRockCompiler) ''
     mkdir -p $external/lib
     cp -a external/llvm-project/llvm/lib/{*.a*,*.so*} $external/lib
diff --git a/pkgs/development/rocm-modules/5/rocprim/default.nix b/pkgs/development/rocm-modules/5/rocprim/default.nix
index e8233547664f..1dd2555c6915 100644
--- a/pkgs/development/rocm-modules/5/rocprim/default.nix
+++ b/pkgs/development/rocm-modules/5/rocprim/default.nix
@@ -4,11 +4,12 @@
 , rocmUpdateScript
 , cmake
 , rocm-cmake
-, hip
+, clr
 , gtest
 , gbenchmark
 , buildTests ? false
 , buildBenchmarks ? false
+, gpuTargets ? [ ]
 }:
 
 stdenv.mkDerivation (finalAttrs: {
@@ -33,7 +34,7 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
   ];
 
   buildInputs = lib.optionals buildTests [
@@ -49,6 +50,8 @@ stdenv.mkDerivation (finalAttrs: {
     "-DCMAKE_INSTALL_BINDIR=bin"
     "-DCMAKE_INSTALL_LIBDIR=lib"
     "-DCMAKE_INSTALL_INCLUDEDIR=include"
+  ] ++ lib.optionals (gpuTargets != [ ]) [
+    "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
   ] ++ lib.optionals buildTests [
     "-DBUILD_TEST=ON"
   ] ++ lib.optionals buildBenchmarks [
@@ -58,6 +61,7 @@ stdenv.mkDerivation (finalAttrs: {
   postInstall = lib.optionalString buildTests ''
     mkdir -p $test/bin
     mv $out/bin/test_* $test/bin
+    mv $out/bin/rocprim $test/bin
   '' + lib.optionalString buildBenchmarks ''
     mkdir -p $benchmark/bin
     mv $out/bin/benchmark_* $benchmark/bin
diff --git a/pkgs/development/rocm-modules/5/rocrand/default.nix b/pkgs/development/rocm-modules/5/rocrand/default.nix
index daa24b870ceb..954a299e317e 100644
--- a/pkgs/development/rocm-modules/5/rocrand/default.nix
+++ b/pkgs/development/rocm-modules/5/rocrand/default.nix
@@ -4,11 +4,12 @@
 , rocmUpdateScript
 , cmake
 , rocm-cmake
-, hip
+, clr
 , gtest
 , gbenchmark
 , buildTests ? false
 , buildBenchmarks ? false
+, gpuTargets ? [ ]
 }:
 
 stdenv.mkDerivation (finalAttrs: {
@@ -34,7 +35,7 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
   ];
 
   buildInputs = lib.optionals buildTests [
@@ -46,12 +47,14 @@ stdenv.mkDerivation (finalAttrs: {
   cmakeFlags = [
     "-DCMAKE_C_COMPILER=hipcc"
     "-DCMAKE_CXX_COMPILER=hipcc"
-    "-DHIP_ROOT_DIR=${hip}"
+    "-DHIP_ROOT_DIR=${clr}"
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
     "-DCMAKE_INSTALL_LIBDIR=lib"
     "-DCMAKE_INSTALL_INCLUDEDIR=include"
+  ] ++ lib.optionals (gpuTargets != [ ]) [
+    "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
   ] ++ lib.optionals buildTests [
     "-DBUILD_TEST=ON"
   ] ++ lib.optionals buildBenchmarks [
diff --git a/pkgs/development/rocm-modules/5/rocsolver/default.nix b/pkgs/development/rocm-modules/5/rocsolver/default.nix
index 3a0858af6335..48bf0950351c 100644
--- a/pkgs/development/rocm-modules/5/rocsolver/default.nix
+++ b/pkgs/development/rocm-modules/5/rocsolver/default.nix
@@ -5,7 +5,8 @@
 , cmake
 , rocm-cmake
 , rocblas
-, hip
+, rocsparse
+, clr
 , fmt
 , gtest
 , gfortran
@@ -37,13 +38,14 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
   ] ++ lib.optionals (buildTests || buildBenchmarks) [
     gfortran
   ];
 
   buildInputs = [
     rocblas
+    rocsparse
     fmt
   ] ++ lib.optionals buildTests [
     gtest
@@ -53,6 +55,7 @@ stdenv.mkDerivation (finalAttrs: {
 
   cmakeFlags = [
     "-DCMAKE_CXX_COMPILER=hipcc"
+    "-DCMAKE_CXX_FLAGS=-Wno-switch" # Way too many warnings
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
diff --git a/pkgs/development/rocm-modules/5/rocsparse/default.nix b/pkgs/development/rocm-modules/5/rocsparse/default.nix
index d97951530119..945e03c0bc8b 100644
--- a/pkgs/development/rocm-modules/5/rocsparse/default.nix
+++ b/pkgs/development/rocm-modules/5/rocsparse/default.nix
@@ -6,7 +6,7 @@
 , cmake
 , rocm-cmake
 , rocprim
-, hip
+, clr
 , gfortran
 , git
 , gtest
@@ -14,6 +14,7 @@
 , python3Packages
 , buildTests ? false
 , buildBenchmarks ? false # Seems to depend on tests
+, gpuTargets ? [ ]
 }:
 
 stdenv.mkDerivation (finalAttrs: {
@@ -38,7 +39,7 @@ stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
     gfortran
   ];
 
@@ -59,6 +60,8 @@ stdenv.mkDerivation (finalAttrs: {
     "-DCMAKE_INSTALL_BINDIR=bin"
     "-DCMAKE_INSTALL_LIBDIR=lib"
     "-DCMAKE_INSTALL_INCLUDEDIR=include"
+  ] ++ lib.optionals (gpuTargets != [ ]) [
+    "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
   ] ++ lib.optionals (buildTests || buildBenchmarks) [
     "-DBUILD_CLIENTS_TESTS=ON"
     "-DCMAKE_MATRICES_DIR=/build/source/matrices"
diff --git a/pkgs/development/rocm-modules/5/rocthrust/default.nix b/pkgs/development/rocm-modules/5/rocthrust/default.nix
index e441709f89f7..b80b161f5799 100644
--- a/pkgs/development/rocm-modules/5/rocthrust/default.nix
+++ b/pkgs/development/rocm-modules/5/rocthrust/default.nix
@@ -5,10 +5,11 @@
 , cmake
 , rocm-cmake
 , rocprim
-, hip
+, clr
 , gtest
 , buildTests ? false
 , buildBenchmarks ? false
+, gpuTargets ? [ ]
 }:
 
 stdenv.mkDerivation (finalAttrs: {
@@ -34,7 +35,7 @@ stdenv.mkDerivation (finalAttrs: {
     cmake
     rocm-cmake
     rocprim
-    hip
+    clr
   ];
 
   buildInputs = lib.optionals buildTests [
@@ -43,12 +44,14 @@ stdenv.mkDerivation (finalAttrs: {
 
   cmakeFlags = [
     "-DCMAKE_CXX_COMPILER=hipcc"
-    "-DHIP_ROOT_DIR=${hip}"
+    "-DHIP_ROOT_DIR=${clr}"
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
     "-DCMAKE_INSTALL_LIBDIR=lib"
     "-DCMAKE_INSTALL_INCLUDEDIR=include"
+  ] ++ lib.optionals (gpuTargets != [ ]) [
+    "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
   ] ++ lib.optionals buildTests [
     "-DBUILD_TEST=ON"
   ] ++ lib.optionals buildBenchmarks [
diff --git a/pkgs/development/rocm-modules/5/rocwmma/0000-dont-fetch-googletest.patch b/pkgs/development/rocm-modules/5/rocwmma/0000-dont-fetch-googletest.patch
index cee603679758..fa47a3c42249 100644
--- a/pkgs/development/rocm-modules/5/rocwmma/0000-dont-fetch-googletest.patch
+++ b/pkgs/development/rocm-modules/5/rocwmma/0000-dont-fetch-googletest.patch
@@ -1,8 +1,8 @@
 diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
-index e1160bb..2a5462e 100644
+index 0d00883..86ce282 100644
 --- a/test/CMakeLists.txt
 +++ b/test/CMakeLists.txt
-@@ -30,26 +30,6 @@ cmake_dependent_option( ROCWMMA_BUILD_VALIDATION_TESTS "Build validation tests"
+@@ -30,30 +30,6 @@ cmake_dependent_option( ROCWMMA_BUILD_VALIDATION_TESTS "Build validation tests"
  cmake_dependent_option( ROCWMMA_BUILD_BENCHMARK_TESTS "Build benchmarking tests" OFF "ROCWMMA_BUILD_TESTS" OFF )
  cmake_dependent_option( ROCWMMA_BUILD_EXTENDED_TESTS "Build extended test parameter coverage" OFF "ROCWMMA_BUILD_TESTS" OFF )
  
@@ -12,20 +12,24 @@ index e1160bb..2a5462e 100644
 -FetchContent_Declare(
 -  googletest
 -  GIT_REPOSITORY https://github.com/google/googletest.git
--  GIT_TAG 609281088cfefc76f9d0ce82e1ff6c30cc3591e5
+-  GIT_TAG release-1.12.1
 -)
 -FetchContent_GetProperties(googletest)
 -if(NOT googletest_POPULATED)
+-
 -  # Fetch the content using default details
 -  FetchContent_Populate(googletest)
 -  # Save the shared libs setting, then force to static libs
 -  set(BUILD_SHARED_LIBS_OLD ${BUILD_SHARED_LIBS})
 -  set(BUILD_SHARED_LIBS OFF CACHE INTERNAL "Build SHARED libraries" FORCE)
+-  
 -  # Add gtest targets as static libs
 -  add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
+-  
 -  # Restore shared libs setting
 -  set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_OLD} CACHE INTERNAL "Build SHARED libraries" FORCE)
 -endif()
- 
+-
  set(ROCWMMA_TEST_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR})
  set(ROCWMMA_COMMON_TEST_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/hip_device.cpp
+                                 ${CMAKE_CURRENT_SOURCE_DIR}/rocwmma_gtest_main.cpp)
diff --git a/pkgs/development/rocm-modules/5/rocwmma/default.nix b/pkgs/development/rocm-modules/5/rocwmma/default.nix
index ef21ed86248a..71d0e3fbe793 100644
--- a/pkgs/development/rocm-modules/5/rocwmma/default.nix
+++ b/pkgs/development/rocm-modules/5/rocwmma/default.nix
@@ -4,44 +4,24 @@
 , rocmUpdateScript
 , cmake
 , rocm-cmake
-, hip
+, rocm-smi
+, clr
 , openmp
 , gtest
 , rocblas
-, texlive
-, doxygen
-, sphinx
-, python3Packages
-, buildDocs ? true
-, buildTests ? false
+, buildTests ? false # Will likely fail building because wavefront shifts are not supported for certain archs
 , buildExtendedTests ? false
 , buildBenchmarks ? false
 , buildSamples ? false
 , gpuTargets ? [ ] # gpuTargets = [ "gfx908:xnack-" "gfx90a:xnack-" "gfx90a:xnack+" ... ]
 }:
 
-let
-  latex = lib.optionalAttrs buildDocs texlive.combine {
-    inherit (texlive) scheme-small
-    latexmk
-    tex-gyre
-    fncychap
-    wrapfig
-    capt-of
-    framed
-    needspace
-    tabulary
-    varwidth
-    titlesec;
-  };
-in stdenv.mkDerivation (finalAttrs: {
+stdenv.mkDerivation (finalAttrs: {
   pname = "rocwmma";
   version = "5.7.0";
 
   outputs = [
     "out"
-  ] ++ lib.optionals buildDocs [
-    "doc"
   ] ++ lib.optionals (buildTests || buildBenchmarks) [
     "test"
   ] ++ lib.optionals buildBenchmarks [
@@ -64,28 +44,21 @@ in stdenv.mkDerivation (finalAttrs: {
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
   ];
 
   buildInputs = [
     openmp
   ] ++ lib.optionals (buildTests || buildBenchmarks) [
+    rocm-smi
     gtest
     rocblas
-  ] ++ lib.optionals buildDocs [
-    latex
-    doxygen
-    sphinx
-    python3Packages.sphinx-rtd-theme
-    python3Packages.breathe
   ];
 
   cmakeFlags = [
     "-DCMAKE_CXX_COMPILER=hipcc"
     "-DROCWMMA_BUILD_TESTS=${if buildTests || buildBenchmarks then "ON" else "OFF"}"
-    "-DROCWMMA_BUILD_VALIDATION_TESTS=ON"
     "-DROCWMMA_BUILD_SAMPLES=${if buildSamples then "ON" else "OFF"}"
-    "-DROCWMMA_VALIDATE_WITH_ROCBLAS=ON"
     # Manually define CMAKE_INSTALL_<DIR>
     # See: https://github.com/NixOS/nixpkgs/pull/197838
     "-DCMAKE_INSTALL_BINDIR=bin"
@@ -100,21 +73,7 @@ in stdenv.mkDerivation (finalAttrs: {
     "-DROCWMMA_BENCHMARK_WITH_ROCBLAS=ON"
   ];
 
-  postPatch = lib.optionalString buildDocs ''
-    patchShebangs docs/*.sh
-  '';
-
-  # Unfortunately, it seems like we have to call make on this manually
-  # -DROCWMMA_BUILD_DOCS=ON is invalid, despite being on the README
-  postBuild = lib.optionalString buildDocs ''
-    export HOME=$(mktemp -d)
-    ../docs/run_doc.sh
-  '';
-
-  postInstall = lib.optionalString buildDocs ''
-    mv ../docs/source/_build/html $out/share/doc/rocwmma
-    mv ../docs/source/_build/latex/rocWMMA.pdf $out/share/doc/rocwmma
-  '' + lib.optionalString (buildTests || buildBenchmarks) ''
+  postInstall = lib.optionalString (buildTests || buildBenchmarks) ''
     mkdir -p $test/bin
     mv $out/bin/{*_test,*-validate} $test/bin
   '' + lib.optionalString buildBenchmarks ''
diff --git a/pkgs/development/rocm-modules/5/tensile/default.nix b/pkgs/development/rocm-modules/5/tensile/default.nix
index 7d0165a42060..86dbaa95e192 100644
--- a/pkgs/development/rocm-modules/5/tensile/default.nix
+++ b/pkgs/development/rocm-modules/5/tensile/default.nix
@@ -3,14 +3,20 @@
 , fetchFromGitHub
 , rocmUpdateScript
 , buildPythonPackage
+, pytestCheckHook
+, setuptools
 , pyyaml
 , msgpack
 , pandas
+, joblib
+, filelock
+, rocminfo
 }:
 
 buildPythonPackage rec {
   pname = "tensile";
   version = "5.7.0";
+  format = "pyproject";
 
   src = fetchFromGitHub {
     owner = "ROCmSoftwarePlatform";
@@ -19,12 +25,29 @@ buildPythonPackage rec {
     hash = "sha256-CyPGiM/53duJc/oNtOsl6JSsl9uOOYm5R7O6YXaVOm4=";
   };
 
-  buildInputs = [
+  buildInputs = [ setuptools ];
+
+  propagatedBuildInputs = [
     pyyaml
     msgpack
     pandas
+    joblib
   ];
 
+  doCheck = false; # Too many errors, not sure how to set this up properly
+
+  nativeCheckInputs = [
+    pytestCheckHook
+    filelock
+    rocminfo
+  ];
+
+  preCheck = ''
+    export ROCM_PATH=${rocminfo}
+  '';
+
+  pythonImportsCheck = [ "Tensile" ];
+
   passthru.updateScript = rocmUpdateScript {
     name = pname;
     owner = src.owner;

From 05de74ecf3083ec7a4479dc131a24167f0dfd3d0 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Thu, 5 Oct 2023 02:35:52 -0500
Subject: [PATCH 19/30] rocmPackages.rocm-docs-core: init at 0.25.0

---
 pkgs/development/rocm-modules/5/default.nix   |  2 +
 .../rocm-modules/5/rocm-docs-core/default.nix | 65 +++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 pkgs/development/rocm-modules/5/rocm-docs-core/default.nix

diff --git a/pkgs/development/rocm-modules/5/default.nix b/pkgs/development/rocm-modules/5/default.nix
index d3effc777613..e0295f08720f 100644
--- a/pkgs/development/rocm-modules/5/default.nix
+++ b/pkgs/development/rocm-modules/5/default.nix
@@ -70,6 +70,8 @@ in rec {
     # stdenv = llvm.rocmClangStdenv;
   };
 
+  rocm-docs-core = python3Packages.callPackage ./rocm-docs-core { };
+
   ## ROCm-Developer-Tools ##
   hip-common = callPackage ./hip-common {
     inherit rocmUpdateScript;
diff --git a/pkgs/development/rocm-modules/5/rocm-docs-core/default.nix b/pkgs/development/rocm-modules/5/rocm-docs-core/default.nix
new file mode 100644
index 000000000000..65d21daa5b50
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/rocm-docs-core/default.nix
@@ -0,0 +1,65 @@
+{ lib
+, stdenv
+, fetchFromGitHub
+, gitUpdater
+, buildPythonPackage
+, setuptools
+, beautifulsoup4
+, gitpython
+, pydata-sphinx-theme
+, pygithub
+, sphinx
+, breathe
+, myst-parser
+, sphinx-book-theme
+, sphinx-copybutton
+, sphinx-design
+, sphinx-external-toc
+, sphinx-notfound-page
+, pyyaml
+, fastjsonschema
+}:
+
+buildPythonPackage rec {
+  pname = "rocm-docs-core";
+  version = "0.25.0";
+  format = "pyproject";
+
+  src = fetchFromGitHub {
+    owner = "RadeonOpenCompute";
+    repo = "rocm-docs-core";
+    rev = "v${version}";
+    hash = "sha256-kOsoIK0vaPT60hGr960s5vc0eloSr5CECtd8Dy24YuM=";
+  };
+
+  buildInputs = [ setuptools ];
+
+  propagatedBuildInputs = [
+    beautifulsoup4
+    gitpython
+    pydata-sphinx-theme
+    pygithub
+    sphinx
+    breathe
+    myst-parser
+    sphinx-book-theme
+    sphinx-copybutton
+    sphinx-design
+    sphinx-external-toc
+    sphinx-notfound-page
+    pyyaml
+    fastjsonschema
+  ];
+
+  pythonImportsCheck = [ "rocm_docs" ];
+
+  passthru.updateScript = gitUpdater { rev-prefix = "v"; };
+
+  meta = with lib; {
+    description = "ROCm Documentation Python package for ReadTheDocs build standardization";
+    homepage = "https://github.com/RadeonOpenCompute/rocm-docs-core";
+    license = with licenses; [ mit cc-by-40 ];
+    maintainers = teams.rocm.members;
+    platforms = platforms.linux;
+  };
+}

From a3a45bea58e965b90e1abcea08feed78035b32db Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Thu, 5 Oct 2023 08:39:10 -0500
Subject: [PATCH 20/30] rocmPackages.composable_kernel: unstable-2023-01-16 ->
 5.7.0

---
 .../5/composable_kernel/default.nix           | 24 ++++++++++---------
 pkgs/development/rocm-modules/5/default.nix   |  1 +
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/pkgs/development/rocm-modules/5/composable_kernel/default.nix b/pkgs/development/rocm-modules/5/composable_kernel/default.nix
index 54bf1251c309..2372b27ebe52 100644
--- a/pkgs/development/rocm-modules/5/composable_kernel/default.nix
+++ b/pkgs/development/rocm-modules/5/composable_kernel/default.nix
@@ -1,10 +1,10 @@
 { lib
 , stdenv
 , fetchFromGitHub
-, unstableGitUpdater
+, rocmUpdateScript
 , cmake
 , rocm-cmake
-, hip
+, clr
 , openmp
 , clang-tools-extra
 , gtest
@@ -15,7 +15,7 @@
 
 stdenv.mkDerivation (finalAttrs: {
   pname = "composable_kernel";
-  version = "unstable-2023-01-16";
+  version = "5.7.0";
 
   outputs = [
     "out"
@@ -25,24 +25,21 @@ stdenv.mkDerivation (finalAttrs: {
     "example"
   ];
 
-  # ROCm 5.6 should release composable_kernel as stable with a tag in the future
   src = fetchFromGitHub {
     owner = "ROCmSoftwarePlatform";
     repo = "composable_kernel";
-    rev = "80e05267417f948e4f7e63c0fe807106d9a0c0ef";
-    hash = "sha256-+c0E2UtlG/abweLwCWWjNHDO5ZvSIVKwwwettT9mqR4=";
+    rev = "rocm-${finalAttrs.version}";
+    hash = "sha256-Z9X+S2SijGJ8bhr9ghkkWicBUzLzs9fxPpqZxX6BBM4=";
   };
 
   nativeBuildInputs = [
     cmake
     rocm-cmake
-    hip
+    clr
     clang-tools-extra
   ];
 
-  buildInputs = [
-    openmp
-  ];
+  buildInputs = [ openmp ];
 
   cmakeFlags = [
     "-DCMAKE_C_COMPILER=hipcc"
@@ -71,7 +68,11 @@ stdenv.mkDerivation (finalAttrs: {
     mv $out/bin/example_* $example/bin
   '';
 
-  passthru.updateScript = unstableGitUpdater { };
+  passthru.updateScript = rocmUpdateScript {
+    name = finalAttrs.pname;
+    owner = finalAttrs.src.owner;
+    repo = finalAttrs.src.repo;
+  };
 
   # Times out otherwise
   requiredSystemFeatures = [ "big-parallel" ];
@@ -82,5 +83,6 @@ stdenv.mkDerivation (finalAttrs: {
     license = with licenses; [ mit ];
     maintainers = teams.rocm.members;
     platforms = platforms.linux;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
   };
 })
diff --git a/pkgs/development/rocm-modules/5/default.nix b/pkgs/development/rocm-modules/5/default.nix
index e0295f08720f..838874f398e2 100644
--- a/pkgs/development/rocm-modules/5/default.nix
+++ b/pkgs/development/rocm-modules/5/default.nix
@@ -237,6 +237,7 @@ in rec {
   };
 
   composable_kernel = callPackage ./composable_kernel {
+    inherit rocmUpdateScript rocm-cmake clr;
     inherit (llvm) openmp clang-tools-extra;
     stdenv = llvm.rocmClangStdenv;
   };

From ae91d1330e833c8b20be31317cb6e27a43974ef5 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Thu, 5 Oct 2023 11:52:09 -0500
Subject: [PATCH 21/30] rocmPackages.clr: replace rocm-opencl-icd

---
 .../rocm-modules/5/clr/default.nix            | 18 +++++++++++++++
 pkgs/development/rocm-modules/5/clr/test.nix  | 23 +++++++++++++++++++
 pkgs/development/rocm-modules/5/default.nix   |  2 +-
 3 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 pkgs/development/rocm-modules/5/clr/test.nix

diff --git a/pkgs/development/rocm-modules/5/clr/default.nix b/pkgs/development/rocm-modules/5/clr/default.nix
index d3f811dcc422..c0d4de8cbb7c 100644
--- a/pkgs/development/rocm-modules/5/clr/default.nix
+++ b/pkgs/development/rocm-modules/5/clr/default.nix
@@ -1,5 +1,6 @@
 { lib
 , stdenv
+, callPackage
 , fetchFromGitHub
 , rocmUpdateScript
 , makeWrapper
@@ -13,6 +14,7 @@
 , rocm-runtime
 , roctracer
 , rocminfo
+, rocm-smi
 , numactl
 , libGL
 , libxml2
@@ -35,6 +37,11 @@ in stdenv.mkDerivation (finalAttrs: {
   pname = "clr";
   version = "5.7.0";
 
+  outputs = [
+    "out"
+    "icd"
+  ];
+
   src = fetchFromGitHub {
     owner = "ROCm-Developer-Tools";
     repo = "clr";
@@ -106,6 +113,10 @@ in stdenv.mkDerivation (finalAttrs: {
 
     # Just link rocminfo, it's easier
     ln -s ${rocminfo}/bin/* $out/bin
+
+    # Replace rocm-opencl-icd functionality
+    mkdir -p $icd/etc/OpenCL/vendors
+    echo "$out/lib/libamdocl64.so" > $icd/etc/OpenCL/vendors/amdocl64.icd
   '';
 
   passthru = {
@@ -134,6 +145,13 @@ in stdenv.mkDerivation (finalAttrs: {
       owner = finalAttrs.src.owner;
       repo = finalAttrs.src.repo;
     };
+
+    impureTests = {
+      clr-icd = callPackage ./test.nix {
+        inherit rocm-smi;
+        clr = finalAttrs.finalPackage;
+      };
+    };
   };
 
   meta = with lib; {
diff --git a/pkgs/development/rocm-modules/5/clr/test.nix b/pkgs/development/rocm-modules/5/clr/test.nix
new file mode 100644
index 000000000000..c02bb4da8886
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/clr/test.nix
@@ -0,0 +1,23 @@
+{ lib
+, makeImpureTest
+, clinfo
+, clr
+, rocm-smi
+}:
+
+makeImpureTest {
+  name = "clr-icd";
+  testedPackage = "rocmPackages.clr";
+  nativeBuildInputs = [ clinfo rocm-smi ];
+  OCL_ICD_VENDORS = "${clr.icd}/etc/OpenCL/vendors";
+
+  testScript = ''
+    # Test fails if the number of platforms is 0
+    clinfo | grep -E 'Number of platforms * [1-9]'
+    rocm-smi | grep -A1 GPU
+  '';
+
+  meta = with lib; {
+    maintainers = teams.rocm.members;
+  };
+}
diff --git a/pkgs/development/rocm-modules/5/default.nix b/pkgs/development/rocm-modules/5/default.nix
index 838874f398e2..ac8a32ef5176 100644
--- a/pkgs/development/rocm-modules/5/default.nix
+++ b/pkgs/development/rocm-modules/5/default.nix
@@ -86,7 +86,7 @@ in rec {
 
   # Replaces hip, opencl-runtime, and rocclr
   clr = callPackage ./clr {
-    inherit rocmUpdateScript hip-common hipcc rocm-device-libs rocm-comgr rocm-runtime roctracer rocminfo;
+    inherit rocmUpdateScript hip-common hipcc rocm-device-libs rocm-comgr rocm-runtime roctracer rocminfo rocm-smi;
     inherit (llvm) clang;
     stdenv = llvm.rocmClangStdenv;
   };

From fb9321020577ac1ddaabc81488dbb700350eb4e5 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Thu, 5 Oct 2023 13:05:33 -0500
Subject: [PATCH 22/30] nixos/doc: note ROCm changes

---
 nixos/doc/manual/release-notes/rl-2311.section.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/nixos/doc/manual/release-notes/rl-2311.section.md b/nixos/doc/manual/release-notes/rl-2311.section.md
index d1bbb5c389a7..39011f392986 100644
--- a/nixos/doc/manual/release-notes/rl-2311.section.md
+++ b/nixos/doc/manual/release-notes/rl-2311.section.md
@@ -26,6 +26,9 @@
 
 [`sudo-rs`]: https://github.com/memorysafety/sudo-rs/
 
+- All [ROCm](https://rocm.docs.amd.com/en/latest/) packages have been updated to 5.7.0.
+  - [ROCm](https://rocm.docs.amd.com/en/latest/) package attribute sets are versioned: `rocmPackages` -> `rocmPackages_5`.
+
 ## New Services {#sec-release-23.11-new-services}
 
 - [MCHPRS](https://github.com/MCHPR/MCHPRS), a multithreaded Minecraft server built for redstone. Available as [services.mchprs](#opt-services.mchprs.enable).
@@ -148,6 +151,17 @@
 
 - `consul` has been updated to `1.16.0`. See the [release note](https://github.com/hashicorp/consul/releases/tag/v1.16.0) for more details. Once a new Consul version has started and upgraded its data directory, it generally cannot be downgraded to the previous version.
 
+- `llvmPackages_rocm` has been moved to `rocmPackages.llvm`.
+
+- `hip`, `rocm-opencl-runtime`, `rocm-opencl-icd`, and `rocclr` have been combined into `rocmPackages.clr`.
+
+- `clang-ocl`, `clr`, `composable_kernel`, `hipblas`, `hipcc`, `hip-common`, `hipcub`,
+  `hipfft`, `hipfort`, `hipify`, `hipsolver`, `hipsparse`, `migraphx`, `miopen`, `miopengemm`,
+  `rccl`, `rdc`, `rocalution`, `rocblas`, `rocdgbapi`, `rocfft`, `rocgdb`, `rocm-cmake`,
+  `rocm-comgr`, `rocm-core`, `rocm-device-libs`, `rocminfo`, `rocmlir`, `rocm-runtime`,
+  `rocm-smi`, `rocm-thunk`, `rocprim`, `rocprofiler`, `rocrand`, `rocr-debug-agent`,
+  `rocsolver`, `rocsparse`, `rocthrust`, `roctracer`, `rocwmma`, and `tensile` have been moved to `rocmPackages`.
+
 - `himalaya` has been updated to `0.8.0`, which drops the native TLS support (in favor of Rustls) and add OAuth 2.0 support. See the [release note](https://github.com/soywod/himalaya/releases/tag/v0.8.0) for more details.
 
 - `nix-prefetch-git` now ignores global and user git config, to improve reproducibility.

From 6f39d63688276f1953fc1ca3afa16d8bbb7a301d Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Thu, 5 Oct 2023 13:07:45 -0500
Subject: [PATCH 23/30] nixos/doc: rocm-opencl-icd -> rocmPackages.clr.icd

---
 nixos/doc/manual/configuration/gpu-accel.chapter.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nixos/doc/manual/configuration/gpu-accel.chapter.md b/nixos/doc/manual/configuration/gpu-accel.chapter.md
index 40878b5da4b5..dfccdf291b73 100644
--- a/nixos/doc/manual/configuration/gpu-accel.chapter.md
+++ b/nixos/doc/manual/configuration/gpu-accel.chapter.md
@@ -26,7 +26,7 @@ directory which is scanned by the ICL loader for ICD files. For example:
 
 ```ShellSession
 $ export \
-  OCL_ICD_VENDORS=`nix-build '<nixpkgs>' --no-out-link -A rocm-opencl-icd`/etc/OpenCL/vendors/
+  OCL_ICD_VENDORS=`nix-build '<nixpkgs>' --no-out-link -A rocmPackages.clr.icd`/etc/OpenCL/vendors/
 ```
 
 The second mechanism is to add the OpenCL driver package to
@@ -50,13 +50,13 @@ Platform Vendor      Advanced Micro Devices, Inc.
 
 Modern AMD [Graphics Core
 Next](https://en.wikipedia.org/wiki/Graphics_Core_Next) (GCN) GPUs are
-supported through the rocm-opencl-icd package. Adding this package to
+supported through the rocmPackages.clr.icd package. Adding this package to
 [](#opt-hardware.opengl.extraPackages)
 enables OpenCL support:
 
 ```nix
 hardware.opengl.extraPackages = [
-  rocm-opencl-icd
+  rocmPackages.clr.icd
 ];
 ```
 

From e6f88a9a824ad8a34bf9d501aa45b0883a4a80c9 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Thu, 5 Oct 2023 16:58:02 -0500
Subject: [PATCH 24/30] rocm-related: fixup for ROCm 5.7.0

blender: fixup for ROCm 5.7.0

opensycl: fixup for ROCm 5.7.0

magma: fixup for ROCm 5.7.0

torch: fixup for ROCm 5.7.0

cp2k: fixup for ROCm 5.7.0

sirius: fixup for ROCm 5.7.0

spfft: fixup for ROCm 5.7.0

spla: fixup for ROCm 5.7.0
---
 pkgs/applications/misc/blender/default.nix    |  6 ++---
 .../science/chemistry/cp2k/default.nix        | 16 +++++++------
 pkgs/by-name/si/sirius/package.nix            | 11 +++++----
 pkgs/by-name/sp/spfft/package.nix             | 13 ++++++-----
 pkgs/by-name/sp/spla/package.nix              |  9 ++++----
 .../compilers/opensycl/default.nix            | 10 ++++----
 .../libraries/science/math/magma/generic.nix  | 19 +++++++--------
 .../python-modules/torch/default.nix          | 23 ++++++++-----------
 pkgs/top-level/all-packages.nix               |  9 ++------
 pkgs/top-level/python-packages.nix            |  1 -
 10 files changed, 54 insertions(+), 63 deletions(-)

diff --git a/pkgs/applications/misc/blender/default.nix b/pkgs/applications/misc/blender/default.nix
index 0b368ef1b315..00bbcdafff13 100644
--- a/pkgs/applications/misc/blender/default.nix
+++ b/pkgs/applications/misc/blender/default.nix
@@ -6,7 +6,7 @@
 , zlib, zstd, fftw, opensubdiv, freetype, jemalloc, ocl-icd, addOpenGLRunpath
 , jackaudioSupport ? false, libjack2
 , cudaSupport ? config.cudaSupport, cudaPackages ? { }
-, hipSupport ? false, hip # comes with a significantly larger closure size
+, hipSupport ? false, rocmPackages # comes with a significantly larger closure size
 , colladaSupport ? true, opencollada
 , spaceNavSupport ? stdenv.isLinux, libspnav
 , makeWrapper
@@ -103,8 +103,8 @@ stdenv.mkDerivation (finalAttrs: rec {
       substituteInPlace extern/clew/src/clew.c --replace '"libOpenCL.so"' '"${ocl-icd}/lib/libOpenCL.so"'
     '') +
     (lib.optionalString hipSupport ''
-      substituteInPlace extern/hipew/src/hipew.c --replace '"/opt/rocm/hip/lib/libamdhip64.so"' '"${hip}/lib/libamdhip64.so"'
-      substituteInPlace extern/hipew/src/hipew.c --replace '"opt/rocm/hip/bin"' '"${hip}/bin"'
+      substituteInPlace extern/hipew/src/hipew.c --replace '"/opt/rocm/hip/lib/libamdhip64.so"' '"${rocmPackages.clr}/lib/libamdhip64.so"'
+      substituteInPlace extern/hipew/src/hipew.c --replace '"opt/rocm/hip/bin"' '"${rocmPackages.clr}/bin"'
     '');
 
   cmakeFlags =
diff --git a/pkgs/applications/science/chemistry/cp2k/default.nix b/pkgs/applications/science/chemistry/cp2k/default.nix
index 052d791c0bb7..bb306fa322fe 100644
--- a/pkgs/applications/science/chemistry/cp2k/default.nix
+++ b/pkgs/applications/science/chemistry/cp2k/default.nix
@@ -37,11 +37,7 @@
 # and for Nvidia see https://github.com/cp2k/cp2k/blob/master/INSTALL.md#2i-cuda-optional-improved-performance-on-gpu-systems
 , gpuVersion ? "Mi100"
 , gpuArch ? "gfx908"
-, rocm-core
-, hip
-, hipblas
-, hipfft
-, rocblas
+, rocmPackages
 }:
 
 assert builtins.elem gpuBackend [ "none" "cuda" "rocm" ];
@@ -86,7 +82,13 @@ stdenv.mkDerivation rec {
   ]
   ++ lib.optional enableElpa elpa
   ++ lib.optional (gpuBackend == "cuda") cudaPackages.cudatoolkit
-  ++ lib.optional (gpuBackend == "rocm") [hip rocm-core hipblas hipfft rocblas]
+  ++ lib.optional (gpuBackend == "rocm") [
+    rocmPackages.clr
+    rocmPackages.rocm-core
+    rocmPackages.hipblas
+    rocmPackages.hipfft
+    rocmPackages.rocblas
+  ]
   ;
 
   propagatedBuildInputs = [ mpi ];
@@ -126,7 +128,7 @@ stdenv.mkDerivation rec {
     ${lib.strings.optionalString (gpuBackend == "rocm") ''
     GPUVER = ${gpuVersion}
     OFFLOAD_CC = hipcc
-    OFFLOAD_FLAGS = -fopenmp -m64 -pthread -fPIC -D__GRID_HIP -O2 --offload-arch=${gpuArch} --rocm-path=${rocm-core}
+    OFFLOAD_FLAGS = -fopenmp -m64 -pthread -fPIC -D__GRID_HIP -O2 --offload-arch=${gpuArch} --rocm-path=${rocmPackages.rocm-core}
     OFFLOAD_TARGET = hip
     CXX = mpicxx
     CXXFLAGS = -std=c++11 -fopenmp -D__HIP_PLATFORM_AMD__
diff --git a/pkgs/by-name/si/sirius/package.nix b/pkgs/by-name/si/sirius/package.nix
index 05d049a7d45b..2af3c28de922 100644
--- a/pkgs/by-name/si/sirius/package.nix
+++ b/pkgs/by-name/si/sirius/package.nix
@@ -23,8 +23,7 @@
 , llvmPackages
 , gpuBackend ? "none"
 , cudaPackages
-, hip
-, rocblas
+, rocmPackages
 }:
 
 assert builtins.elem gpuBackend [ "none" "cuda" "rocm" ];
@@ -67,8 +66,10 @@ stdenv.mkDerivation rec {
     libvdwxc
   ]
   ++ lib.optional (gpuBackend == "cuda") cudaPackages.cudatoolkit
-  ++ lib.optionals (gpuBackend == "rocm") [ hip rocblas ]
-  ++ lib.optional stdenv.isDarwin llvmPackages.openmp
+  ++ lib.optionals (gpuBackend == "rocm") [
+    rocmPackages.clr
+    rocmPackages.rocblas
+  ] ++ lib.optional stdenv.isDarwin llvmPackages.openmp
   ;
 
   propagatedBuildInputs = [ mpi ];
@@ -87,7 +88,7 @@ stdenv.mkDerivation rec {
   ]
   ++ lib.optionals (gpuBackend == "rocm") [
     "-DUSE_ROCM=ON"
-    "-DHIP_ROOT_DIR=${hip}"
+    "-DHIP_ROOT_DIR=${rocmPackages.clr}"
   ];
 
   doCheck = true;
diff --git a/pkgs/by-name/sp/spfft/package.nix b/pkgs/by-name/sp/spfft/package.nix
index dcc43ccd2446..72ae473d14a5 100644
--- a/pkgs/by-name/sp/spfft/package.nix
+++ b/pkgs/by-name/sp/spfft/package.nix
@@ -8,9 +8,7 @@
 , llvmPackages
 , gpuBackend ? "none"
 , cudaPackages
-, hip
-, rocfft
-, hipfft
+, rocmPackages
 }:
 
 assert builtins.elem gpuBackend [ "none" "cuda" "rocm" ];
@@ -35,8 +33,11 @@ stdenv.mkDerivation rec {
     fftw
   ]
   ++ lib.optional (gpuBackend == "cuda") cudaPackages.cudatoolkit
-  ++ lib.optionals (gpuBackend == "rocm") [ hip rocfft hipfft ]
-  ++ lib.optional stdenv.isDarwin llvmPackages.openmp
+  ++ lib.optionals (gpuBackend == "rocm") [
+    rocmPackages.clr
+    rocmPackages.rocfft
+    rocmPackages.hipfft
+  ] ++ lib.optional stdenv.isDarwin llvmPackages.openmp
   ;
 
   propagatedBuildInputs = [ mpi ];
@@ -53,7 +54,7 @@ stdenv.mkDerivation rec {
   ++ lib.optional (gpuBackend == "cuda") "-DSPFFT_GPU_BACKEND=CUDA"
   ++ lib.optionals (gpuBackend == "rocm") [
     "-DSPFFT_GPU_BACKEND=ROCM"
-    "-DHIP_ROOT_DIR=${hip}"
+    "-DHIP_ROOT_DIR=${rocmPackages.clr}"
   ];
 
 
diff --git a/pkgs/by-name/sp/spla/package.nix b/pkgs/by-name/sp/spla/package.nix
index 3143fbeb7316..1f8abde4b723 100644
--- a/pkgs/by-name/sp/spla/package.nix
+++ b/pkgs/by-name/sp/spla/package.nix
@@ -8,8 +8,7 @@
 , llvmPackages
 , gpuBackend ? "none"
 , cudaPackages
-, hip
-, rocblas
+, rocmPackages
 }:
 
 assert builtins.elem gpuBackend [ "none" "cuda" "rocm" ];
@@ -39,8 +38,10 @@ stdenv.mkDerivation rec {
     blas
   ]
   ++ lib.optional (gpuBackend == "cuda") cudaPackages.cudatoolkit
-  ++ lib.optionals (gpuBackend == "rocm") [ hip rocblas rocblas ]
-  ++ lib.optional stdenv.isDarwin llvmPackages.openmp
+  ++ lib.optionals (gpuBackend == "rocm") [
+    rocmPackages.clr
+    rocmPackages.rocblas
+  ] ++ lib.optional stdenv.isDarwin llvmPackages.openmp
   ;
 
   propagatedBuildInputs = [ mpi ];
diff --git a/pkgs/development/compilers/opensycl/default.nix b/pkgs/development/compilers/opensycl/default.nix
index d6f11798f199..995b21330a99 100644
--- a/pkgs/development/compilers/opensycl/default.nix
+++ b/pkgs/development/compilers/opensycl/default.nix
@@ -2,15 +2,13 @@
 , fetchFromGitHub
 , llvmPackages_15
 , lld_15
-, rocm-device-libs
 , python3
-, rocm-runtime
 , cmake
 , boost
 , libxml2
 , libffi
 , makeWrapper
-, hip
+, rocmPackages
 , rocmSupport ? false
 }:
 let
@@ -40,8 +38,8 @@ stdenv.mkDerivation rec {
     llvmPackages_15.libclang.dev
     llvmPackages_15.llvm
   ] ++ lib.optionals rocmSupport [
-    hip
-    rocm-runtime
+    rocmPackages.clr
+    rocmPackages.rocm-runtime
   ];
 
   # opensycl makes use of clangs internal headers. Its cmake does not successfully discover them automatically on nixos, so we supply the path manually
@@ -55,7 +53,7 @@ stdenv.mkDerivation rec {
       --add-flags "-L${llvmPackages_15.openmp}/lib" \
       --add-flags "-I${llvmPackages_15.openmp.dev}/include" \
   '' + lib.optionalString rocmSupport ''
-    --add-flags "--rocm-device-lib-path=${rocm-device-libs}/amdgcn/bitcode"
+    --add-flags "--rocm-device-lib-path=${rocmPackages.rocm-device-libs}/amdgcn/bitcode"
   '';
 
   meta = with lib; {
diff --git a/pkgs/development/libraries/science/math/magma/generic.nix b/pkgs/development/libraries/science/math/magma/generic.nix
index b3753a63339a..e9712ffb62ad 100644
--- a/pkgs/development/libraries/science/math/magma/generic.nix
+++ b/pkgs/development/libraries/science/math/magma/generic.nix
@@ -18,15 +18,12 @@
 , gfortran
 , cudaCapabilities ? cudaPackages.cudaFlags.cudaCapabilities
 , gpuTargets ? [ ] # Non-CUDA targets, that is HIP
-, hip
-, hipblas
-, hipsparse
+, rocmPackages
 , lapack
 , lib
 , libpthreadstubs
 , magmaRelease
 , ninja
-, openmp
 , rocmSupport ? false
 , static ? false
 , stdenv
@@ -47,7 +44,7 @@ let
   # NOTE: The hip.gpuTargets are prefixed with "gfx" instead of "sm" like cudaFlags.realArches.
   #   For some reason, Magma's CMakeLists.txt file does not handle the "gfx" prefix, so we must
   #   remove it.
-  rocmArches = lists.map (x: strings.removePrefix "gfx" x) hip.gpuTargets;
+  rocmArches = lists.map (x: strings.removePrefix "gfx" x) rocmPackages.clr.gpuTargets;
   supportedRocmArches = lists.intersectLists rocmArches supportedGpuTargets;
   unsupportedRocmArches = lists.subtractLists supportedRocmArches rocmArches;
 
@@ -125,10 +122,10 @@ stdenv.mkDerivation {
   ] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
     cuda_profiler_api.dev # <cuda_profiler_api.h>
   ]) ++ lists.optionals rocmSupport [
-    hip
-    hipblas
-    hipsparse
-    openmp
+    rocmPackages.clr
+    rocmPackages.hipblas
+    rocmPackages.hipsparse
+    rocmPackages.llvm.openmp
   ];
 
   cmakeFlags = [
@@ -142,8 +139,8 @@ stdenv.mkDerivation {
     "-DCMAKE_CXX_COMPILER=${backendStdenv.cc}/bin/c++"
     "-DMAGMA_ENABLE_CUDA=ON"
   ] ++ lists.optionals rocmSupport [
-    "-DCMAKE_C_COMPILER=${hip}/bin/hipcc"
-    "-DCMAKE_CXX_COMPILER=${hip}/bin/hipcc"
+    "-DCMAKE_C_COMPILER=${rocmPackages.clr}/bin/hipcc"
+    "-DCMAKE_CXX_COMPILER=${rocmPackages.clr}/bin/hipcc"
     "-DMAGMA_ENABLE_HIP=ON"
   ];
 
diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index f9f6e377b139..c9c400b57bd5 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -43,11 +43,7 @@
 
   # ROCm dependencies
   rocmSupport ? false,
-  gpuTargets ? [ ],
-  openmp, rocm-core, hip, rccl, miopen, miopengemm, rocrand, rocblas,
-  rocfft, rocsparse, hipsparse, rocthrust, rocprim, hipcub, roctracer,
-  rocsolver, hipfft, hipsolver, hipblas, rocminfo, rocm-thunk, rocm-comgr,
-  rocm-device-libs, rocm-runtime, rocm-opencl-runtime, hipify
+  gpuTargets ? [ ], rocmPackages
 }:
 
 let
@@ -89,7 +85,7 @@ let
     else if cudaSupport then
       gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
     else if rocmSupport then
-      hip.gpuTargets
+      rocmPackages.clr.gpuTargets
     else
       throw "No GPU targets specified"
   );
@@ -97,12 +93,13 @@ let
   rocmtoolkit_joined = symlinkJoin {
     name = "rocm-merged";
 
-    paths = [
-      rocm-core hip rccl miopen miopengemm rocrand rocblas
-      rocfft rocsparse hipsparse rocthrust rocprim hipcub
-      roctracer rocfft rocsolver hipfft hipsolver hipblas
+    paths = with rocmPackages; [
+      rocm-core clr rccl miopen miopengemm rocrand rocblas
+      rocsparse hipsparse rocthrust rocprim hipcub
+      roctracer # Unfree at the moment due to hsa-amd-aqlprofile hard dependency in rocprofiler
+      rocfft rocsolver hipfft hipsolver hipblas
       rocminfo rocm-thunk rocm-comgr rocm-device-libs
-      rocm-runtime rocm-opencl-runtime hipify
+      rocm-runtime clr.icd hipify
     ];
   };
 
@@ -170,7 +167,7 @@ in buildPythonPackage rec {
     # Strangely, this is never set in cmake
     substituteInPlace cmake/public/LoadHIP.cmake \
       --replace "set(ROCM_PATH \$ENV{ROCM_PATH})" \
-        "set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${lib.concatStrings (lib.intersperse "0" (lib.splitString "." hip.version))})"
+        "set(ROCM_PATH \$ENV{ROCM_PATH})''\nset(ROCM_VERSION ${lib.concatStrings (lib.intersperse "0" (lib.splitString "." rocmPackages.clr.version))})"
   ''
   # Detection of NCCL version doesn't work particularly well when using the static binary.
   + lib.optionalString cudaSupport ''
@@ -323,7 +320,7 @@ in buildPythonPackage rec {
     ] ++ lists.optionals (strings.versionAtLeast cudaVersion "11.8") [
       cuda_profiler_api.dev # <cuda_profiler_api.h>
     ])
-    ++ lib.optionals rocmSupport [ openmp ]
+    ++ lib.optionals rocmSupport [ rocmPackages.llvm.openmp ]
     ++ lib.optionals (cudaSupport || rocmSupport) [ magma ]
     ++ lib.optionals stdenv.isLinux [ numactl ]
     ++ lib.optionals stdenv.isDarwin [ Accelerate CoreServices libobjc ];
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index d365df0eb0f0..a62783c951af 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -30690,7 +30690,6 @@ with pkgs;
     # LLVM 11 crashes when compiling GHOST_SystemCocoa.mm
     stdenv = if stdenv.isDarwin then llvmPackages_10.stdenv else stdenv;
     inherit (darwin.apple_sdk.frameworks) Cocoa CoreGraphics ForceFeedback OpenAL OpenGL;
-    inherit (rocmPackages) hip;
   };
 
   blender-with-packages = callPackage ../applications/misc/blender/wrapper.nix { };
@@ -39262,10 +39261,7 @@ with pkgs;
 
   lie = callPackage ../applications/science/math/LiE { };
 
-  inherit (callPackage ../development/libraries/science/math/magma {
-    inherit (rocmPackages.llvm) openmp;
-    inherit (rocmPackages) hip hipblas hipsparse;
-  }) magma magma_2_7_2 magma_2_6_2;
+  inherit (callPackage ../development/libraries/science/math/magma { }) magma magma_2_7_2 magma_2_6_2;
 
   magma-cuda = magma.override {
     cudaSupport = true;
@@ -39276,8 +39272,7 @@ with pkgs;
     static = true;
   };
 
-  # TODO:AMD won't compile with anything newer than 2.6.2 -- it fails at the linking stage.
-  magma-hip = magma_2_6_2.override {
+  magma-hip = magma.override {
     cudaSupport = false;
     rocmSupport = true;
   };
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index c0857bf850f8..16751903fe50 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -13904,7 +13904,6 @@ self: super: with self; {
       else pkgs.magma;
     inherit (pkgs.darwin.apple_sdk.frameworks) Accelerate CoreServices;
     inherit (pkgs.darwin) libobjc;
-    inherit (pkgs.rocmPackages.llvm) openmp;
   };
 
   torch-bin = callPackage ../development/python-modules/torch/bin.nix {

From 691b69fd80e1253e15164c4db8f9cec5a322279a Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Fri, 6 Oct 2023 18:18:24 -0500
Subject: [PATCH 25/30] python3Packages.openai-triton: Use custom LLVM and
 refactor

This project does not rely on ROCM's LLVM fork! Using it can cause a lot of problems when it upgrades!
---
 .../python-modules/openai-triton/default.nix  |  222 +-
 .../python-modules/openai-triton/llvm.nix     |  112 +
 .../python-modules/openai-triton/llvm15.patch | 4617 -----------------
 pkgs/top-level/python-packages.nix            |    2 +-
 4 files changed, 188 insertions(+), 4765 deletions(-)
 create mode 100644 pkgs/development/python-modules/openai-triton/llvm.nix
 delete mode 100644 pkgs/development/python-modules/openai-triton/llvm15.patch

diff --git a/pkgs/development/python-modules/openai-triton/default.nix b/pkgs/development/python-modules/openai-triton/default.nix
index 0e10642f0693..68dfc24aafda 100644
--- a/pkgs/development/python-modules/openai-triton/default.nix
+++ b/pkgs/development/python-modules/openai-triton/default.nix
@@ -1,12 +1,13 @@
 { lib
+, callPackage
 , buildPythonPackage
-, python
-, fetchpatch
 , fetchFromGitHub
 , addOpenGLRunpath
+, pytestCheckHook
+, pythonRelaxDepsHook
+, pkgsTargetTarget
 , cmake
-, cudaPackages
-, llvmPackages
+, ninja
 , pybind11
 , gtest
 , zlib
@@ -15,18 +16,11 @@
 , lit
 , filelock
 , torchWithRocm
-, pytest
-, pytestCheckHook
-, pythonRelaxDepsHook
-, pkgsTargetTarget
+, python
+, cudaPackages
 }:
 
 let
-  pname = "triton";
-  version = "2.0.0";
-
-  inherit (cudaPackages) cuda_cudart backendStdenv;
-
   # A time may come we'll want to be cross-friendly
   #
   # Short explanation: we need pkgsTargetTarget, because we use string
@@ -38,20 +32,12 @@ let
   # pkgsTargetTarget maybe doesn't matter, because ptxas compiles programs to
   # be executed on the GPU.
   # Cf. https://nixos.org/manual/nixpkgs/unstable/#sec-cross-infra
-  ptxas = "${pkgsTargetTarget.cudaPackages.cuda_nvcc}/bin/ptxas";
-
-  llvm = (llvmPackages.llvm.override {
-    llvmTargetsToBuild = [ "NATIVE" "NVPTX" ];
-    # Upstream CI sets these too:
-    # targetProjects = [ "mlir" ];
-    extraCMakeFlags = [
-      "-DLLVM_INSTALL_UTILS=ON"
-    ];
-  });
+  ptxas = "${pkgsTargetTarget.cudaPackages.cuda_nvcc}/bin/ptxas"; # Make sure cudaPackages is the right version each update (See python/setup.py)
+  llvm = callPackage ./llvm.nix { }; # Use a custom llvm, see llvm.nix for details
 in
-buildPythonPackage {
-  inherit pname version;
-
+buildPythonPackage rec {
+  pname = "triton";
+  version = "2.0.0";
   format = "setuptools";
 
   src = fetchFromGitHub {
@@ -62,21 +48,6 @@ buildPythonPackage {
   };
 
   patches = [
-    # Prerequisite for llvm15 patch
-    (fetchpatch {
-      url = "https://github.com/openai/triton/commit/2aba985daaa70234823ea8f1161da938477d3e02.patch";
-      hash = "sha256-LGv0+Ut2WYPC4Ksi4803Hwmhi3FyQOF9zElJc/JCobk=";
-    })
-    (fetchpatch {
-      url = "https://github.com/openai/triton/commit/e3941f9d09cdd31529ba4a41018cfc0096aafea6.patch";
-      hash = "sha256-A+Gor6qzFlGQhVVhiaaYOzqqx8yO2MdssnQS6TIfUWg=";
-    })
-
-    # Source: https://github.com/openai/triton/commit/fc7a8e35819bda632bdcf1cf75fd9abe4d4e077a.patch
-    # The original patch adds ptxas binary, so we include our own clean copy
-    # Drop with the next update
-    ./llvm15.patch
-
     # TODO: there have been commits upstream aimed at removing the "torch"
     # circular dependency, but the patches fail to apply on the release
     # revision. Keeping the link for future reference
@@ -88,70 +59,11 @@ buildPythonPackage {
     # })
   ];
 
-  postPatch = ''
-    substituteInPlace python/setup.py \
-      --replace \
-        '= get_thirdparty_packages(triton_cache_path)' \
-        '= os.environ["cmakeFlags"].split()'
-  ''
-  # Wiring triton=2.0.0 with llcmPackages_rocm.llvm=5.4.3
-  # Revisit when updating either triton or llvm
-  + ''
-    substituteInPlace CMakeLists.txt \
-      --replace "nvptx" "NVPTX" \
-      --replace "LLVM 11" "LLVM"
-    sed -i '/AddMLIR/a set(MLIR_TABLEGEN_EXE "${llvmPackages.mlir}/bin/mlir-tblgen")' CMakeLists.txt
-    sed -i '/AddMLIR/a set(MLIR_INCLUDE_DIR ''${MLIR_INCLUDE_DIRS})' CMakeLists.txt
-    find -iname '*.td' -exec \
-      sed -i \
-      -e '\|include "mlir/IR/OpBase.td"|a include "mlir/IR/AttrTypeBase.td"' \
-      -e 's|include "mlir/Dialect/StandardOps/IR/Ops.td"|include "mlir/Dialect/Func/IR/FuncOps.td"|' \
-      '{}' ';'
-    substituteInPlace unittest/CMakeLists.txt --replace "include(GoogleTest)" "find_package(GTest REQUIRED)"
-    sed -i 's/^include.*$//' unittest/CMakeLists.txt
-    sed -i '/LINK_LIBS/i NVPTXInfo' lib/Target/PTX/CMakeLists.txt
-    sed -i '/LINK_LIBS/i NVPTXCodeGen' lib/Target/PTX/CMakeLists.txt
-  ''
-  # TritonMLIRIR already links MLIRIR. Not transitive?
-  # + ''
-  #   echo "target_link_libraries(TritonPTX PUBLIC MLIRIR)" >> lib/Target/PTX/CMakeLists.txt
-  # ''
-  # Already defined in llvm, when built with -DLLVM_INSTALL_UTILS
-  + ''
-    substituteInPlace bin/CMakeLists.txt \
-      --replace "add_subdirectory(FileCheck)" ""
-
-    rm cmake/FindLLVM.cmake
-  ''
-  +
-  (
-    let
-      # Bash was getting weird without linting,
-      # but basically upstream contains [cc, ..., "-lcuda", ...]
-      # and we replace it with [..., "-lcuda", "-L/run/opengl-driver/lib", "-L$stubs", ...]
-      old = [ "-lcuda" ];
-      new = [ "-lcuda" "-L${addOpenGLRunpath.driverLink}" "-L${cuda_cudart}/lib/stubs/" ];
-
-      quote = x: ''"${x}"'';
-      oldStr = lib.concatMapStringsSep ", " quote old;
-      newStr = lib.concatMapStringsSep ", " quote new;
-    in
-    ''
-      substituteInPlace python/triton/compiler.py \
-        --replace '${oldStr}' '${newStr}'
-    ''
-  )
-  # Triton seems to be looking up cuda.h
-  + ''
-    sed -i 's|cu_include_dir = os.path.join.*$|cu_include_dir = "${cuda_cudart}/include"|' python/triton/compiler.py
-  '';
-
   nativeBuildInputs = [
-    cmake
     pythonRelaxDepsHook
-
-    # Requires torch (circular dependency) and probably needs GPUs:
-    # pytestCheckHook
+    # pytestCheckHook # Requires torch (circular dependency) and probably needs GPUs:
+    cmake
+    ninja
 
     # Note for future:
     # These *probably* should go in depsTargetTarget
@@ -159,7 +71,6 @@ buildPythonPackage {
     # because we only support cudaPackages on x86_64-linux atm
     lit
     llvm
-    llvmPackages.mlir
   ];
 
   buildInputs = [
@@ -170,17 +81,44 @@ buildPythonPackage {
     zlib
   ];
 
-  propagatedBuildInputs = [
-    filelock
-  ];
+  propagatedBuildInputs = [ filelock ];
+
+  postPatch = let
+    # Bash was getting weird without linting,
+    # but basically upstream contains [cc, ..., "-lcuda", ...]
+    # and we replace it with [..., "-lcuda", "-L/run/opengl-driver/lib", "-L$stubs", ...]
+    old = [ "-lcuda" ];
+    new = [ "-lcuda" "-L${addOpenGLRunpath.driverLink}" "-L${cudaPackages.cuda_cudart}/lib/stubs/" ];
+
+    quote = x: ''"${x}"'';
+    oldStr = lib.concatMapStringsSep ", " quote old;
+    newStr = lib.concatMapStringsSep ", " quote new;
+  in ''
+    # Use our `cmakeFlags` instead and avoid downloading dependencies
+    substituteInPlace python/setup.py \
+      --replace "= get_thirdparty_packages(triton_cache_path)" "= os.environ[\"cmakeFlags\"].split()"
+
+    # Already defined in llvm, when built with -DLLVM_INSTALL_UTILS
+    substituteInPlace bin/CMakeLists.txt \
+      --replace "add_subdirectory(FileCheck)" ""
+
+    # Use our linker flags
+    substituteInPlace python/triton/compiler.py \
+      --replace '${oldStr}' '${newStr}'
+
+    # Don't fetch googletest
+    substituteInPlace unittest/CMakeLists.txt \
+      --replace "include (\''${CMAKE_CURRENT_SOURCE_DIR}/googletest.cmake)" ""\
+      --replace "include(GoogleTest)" "find_package(GTest REQUIRED)"
+  '';
 
   # Avoid GLIBCXX mismatch with other cuda-enabled python packages
   preConfigure = ''
-    export CC="${backendStdenv.cc}/bin/cc";
-    export CXX="${backendStdenv.cc}/bin/c++";
+    export CC=${cudaPackages.backendStdenv.cc}/bin/cc;
+    export CXX=${cudaPackages.backendStdenv.cc}/bin/c++;
 
     # Upstream's setup.py tries to write cache somewhere in ~/
-    export HOME=$TMPDIR
+    export HOME=$(mktemp -d)
 
     # Upstream's github actions patch setup.cfg to write base-dir. May be redundant
     echo "
@@ -188,52 +126,41 @@ buildPythonPackage {
     base-dir=$PWD" >> python/setup.cfg
 
     # The rest (including buildPhase) is relative to ./python/
-    cd python/
+    cd python
 
     # Work around download_and_copy_ptxas()
-    dst_cuda="$PWD/triton/third_party/cuda/bin"
-    mkdir -p "$dst_cuda"
-    ln -s "${ptxas}" "$dst_cuda/"
+    mkdir -p $PWD/triton/third_party/cuda/bin
+    ln -s ${ptxas} $PWD/triton/third_party/cuda/bin
   '';
 
   # CMake is run by setup.py instead
   dontUseCmakeConfigure = true;
-  cmakeFlags = [
-    "-DMLIR_DIR=${llvmPackages.mlir}/lib/cmake/mlir"
-  ];
 
-  postFixup =
-    let
-      ptxasDestination = "$out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas";
-    in
-    # Setuptools (?) strips runpath and +x flags. Let's just restore the symlink
-    ''
-      rm -f ${ptxasDestination}
-      ln -s ${ptxas} ${ptxasDestination}
-    '';
+  # Setuptools (?) strips runpath and +x flags. Let's just restore the symlink
+  postFixup = ''
+    rm -f $out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas
+    ln -s ${ptxas} $out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas
+  '';
 
-  checkInputs = [
-    cmake # ctest
-  ];
+  checkInputs = [ cmake ]; # ctest
   dontUseSetuptoolsCheck = true;
-  preCheck =
+
+  preCheck = ''
     # build/temp* refers to build_ext.build_temp (looked up in the build logs)
-    ''
-      (cd /build/source/python/build/temp* ; ctest)
-    '' # For pytestCheckHook
-    + ''
-      cd test/unit
-    '';
-  pythonImportsCheck = [
-    # Circular dependency on torch
-    # "triton"
-    # "triton.language"
-  ];
+    (cd /build/source/python/build/temp* ; ctest)
+
+    # For pytestCheckHook
+    cd test/unit
+  '';
+
+  # Circular dependency on torch
+  # pythonImportsCheck = [
+  #   "triton"
+  #   "triton.language"
+  # ];
 
   # Ultimately, torch is our test suite:
-  passthru.tests = {
-    inherit torchWithRocm;
-  };
+  passthru.tests = { inherit torchWithRocm; };
 
   pythonRemoveDeps = [
     # Circular dependency, cf. https://github.com/openai/triton/issues/1374
@@ -243,11 +170,12 @@ buildPythonPackage {
     "cmake"
     "lit"
   ];
+
   meta = with lib; {
-    description = "Development repository for the Triton language and compiler";
-    homepage = "https://github.com/openai/triton/";
+    description = "Language and compiler for writing highly efficient custom Deep-Learning primitives";
+    homepage = "https://github.com/openai/triton";
     platforms = lib.platforms.unix;
     license = licenses.mit;
-    maintainers = with maintainers; [ SomeoneSerge ];
+    maintainers = with maintainers; [ SomeoneSerge Madouura ];
   };
 }
diff --git a/pkgs/development/python-modules/openai-triton/llvm.nix b/pkgs/development/python-modules/openai-triton/llvm.nix
new file mode 100644
index 000000000000..6ac0d9f5738c
--- /dev/null
+++ b/pkgs/development/python-modules/openai-triton/llvm.nix
@@ -0,0 +1,112 @@
+{ lib
+, stdenv
+, fetchFromGitHub
+, pkg-config
+, cmake
+, ninja
+, git
+, doxygen
+, sphinx
+, libxml2
+, libxcrypt
+, libedit
+, libffi
+, mpfr
+, zlib
+, ncurses
+, python3Packages
+, buildDocs ? true
+, buildMan ? true
+, buildTests ? true
+}:
+
+stdenv.mkDerivation (finalAttrs: {
+  pname = "triton-llvm";
+  version = "14.0.6-f28c006a5895";
+
+  outputs = [
+    "out"
+  ] ++ lib.optionals buildDocs [
+    "doc"
+  ] ++ lib.optionals buildMan [
+    "man"
+  ];
+
+  # See https://github.com/openai/triton/blob/main/python/setup.py and https://github.com/ptillet/triton-llvm-releases/releases
+  src = fetchFromGitHub {
+    owner = "llvm";
+    repo = "llvm-project";
+    rev = "f28c006a5895fc0e329fe15fead81e37457cb1d1";
+    hash = "sha256-vffu4HilvYwtzwgq+NlS26m65DGbp6OSSne2aje1yJE=";
+  };
+
+  nativeBuildInputs = [
+    pkg-config
+    cmake
+    ninja
+    git
+    python3Packages.python
+  ] ++ lib.optionals (buildDocs || buildMan) [
+    doxygen
+    sphinx
+    python3Packages.recommonmark
+  ];
+
+  buildInputs = [
+    libxml2
+    libxcrypt
+    libedit
+    libffi
+    mpfr
+  ];
+
+  propagatedBuildInputs = [
+    zlib
+    ncurses
+  ];
+
+  sourceRoot = "${finalAttrs.src.name}/llvm";
+
+  cmakeFlags = [
+    "-DLLVM_TARGETS_TO_BUILD=X86;AMDGPU;NVPTX"
+    "-DLLVM_ENABLE_PROJECTS=llvm;mlir"
+    "-DLLVM_INSTALL_UTILS=ON"
+  ] ++ lib.optionals (buildDocs || buildMan) [
+    "-DLLVM_INCLUDE_DOCS=ON"
+    "-DMLIR_INCLUDE_DOCS=ON"
+    "-DLLVM_BUILD_DOCS=ON"
+    # "-DLLVM_ENABLE_DOXYGEN=ON" Way too slow, only uses one core
+    "-DLLVM_ENABLE_SPHINX=ON"
+    "-DSPHINX_OUTPUT_HTML=ON"
+    "-DSPHINX_OUTPUT_MAN=ON"
+    "-DSPHINX_WARNINGS_AS_ERRORS=OFF"
+  ] ++ lib.optionals buildTests [
+    "-DLLVM_INCLUDE_TESTS=ON"
+    "-DMLIR_INCLUDE_TESTS=ON"
+    "-DLLVM_BUILD_TESTS=ON"
+  ];
+
+  postPatch = ''
+    # `CMake Error: cannot write to file "/build/source/llvm/build/lib/cmake/mlir/MLIRTargets.cmake": Permission denied`
+    chmod +w -R ../mlir
+
+    # FileSystem permissions tests fail with various special bits
+    rm test/tools/llvm-objcopy/ELF/mirror-permissions-unix.test
+    rm unittests/Support/Path.cpp
+
+    substituteInPlace unittests/Support/CMakeLists.txt \
+      --replace "Path.cpp" ""
+  '';
+
+  doCheck = buildTests;
+  requiredSystemFeatures = [ "big-parallel" ];
+
+  meta = with lib; {
+    description = "Collection of modular and reusable compiler and toolchain technologies";
+    homepage = "https://github.com/llvm/llvm-project";
+    license = with licenses; [ ncsa ];
+    maintainers = with maintainers; [ SomeoneSerge Madouura ];
+    platforms = platforms.linux;
+    broken = stdenv.isAarch64; # https://github.com/RadeonOpenCompute/ROCm/issues/1831#issuecomment-1278205344
+  };
+})
diff --git a/pkgs/development/python-modules/openai-triton/llvm15.patch b/pkgs/development/python-modules/openai-triton/llvm15.patch
deleted file mode 100644
index 3e20cce23801..000000000000
--- a/pkgs/development/python-modules/openai-triton/llvm15.patch
+++ /dev/null
@@ -1,4617 +0,0 @@
-From fc7a8e35819bda632bdcf1cf75fd9abe4d4e077a Mon Sep 17 00:00:00 2001
-From: Christian Sigg <chsigg@users.noreply.github.com>
-Date: Thu, 16 Feb 2023 15:40:53 +0100
-Subject: [PATCH] Rebase Triton to LLVM-15. (#1070)
-
-This PR rebases Triton from LLVM-14 to LLVM-15. Most changes are
-mechanical, except for the analysis framework changes.
----
- CMakeLists.txt                                |   6 +-
- bin/CMakeLists.txt                            |   2 +-
- bin/FileCheck/FileCheck.cpp                   |   3 +
- bin/triton-opt.cpp                            |   6 +-
- bin/triton-translate.cpp                      |   7 +-
- include/triton/Analysis/Alias.h               |  21 +-
- include/triton/Analysis/Allocation.h          |   2 +
- include/triton/Analysis/AxisInfo.h            |  56 ++-
- include/triton/Analysis/Utility.h             |   6 +-
- include/triton/Conversion/Passes.td           |   4 +-
- include/triton/Dialect/Triton/IR/Dialect.h    |   7 +-
- .../triton/Dialect/Triton/IR/TritonDialect.td |   8 +-
- include/triton/Dialect/Triton/IR/TritonOps.td |  12 +-
- .../triton/Dialect/Triton/IR/TritonTypes.td   |   2 +
- .../Dialect/Triton/Transforms/Passes.td       |   3 +-
- include/triton/Dialect/TritonGPU/IR/Dialect.h |   4 +-
- .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td |   7 +
- .../Dialect/TritonGPU/IR/TritonGPUDialect.td  |   2 +-
- .../Dialect/TritonGPU/IR/TritonGPUOps.td      |  13 +-
- lib/Analysis/Alias.cpp                        |  14 +-
- lib/Analysis/Allocation.cpp                   |  30 +-
- lib/Analysis/AxisInfo.cpp                     |  79 ++--
- lib/Analysis/CMakeLists.txt                   |   2 +-
- lib/Analysis/Membar.cpp                       |   2 +-
- lib/Analysis/Utility.cpp                      |  54 +++
- .../TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp |   3 -
- lib/Conversion/TritonGPUToLLVM/DotOpHelpers.h |  10 +-
- .../TritonGPUToLLVM/DotOpToLLVM.cpp           |   5 -
- .../TritonGPUToLLVM/ElementwiseOpToLLVM.cpp   |   2 -
- .../TritonGPUToLLVM/LoadStoreOpToLLVM.cpp     |   5 +-
- .../TritonGPUToLLVM/ReduceOpToLLVM.cpp        |   2 -
- .../TritonGPUToLLVM/TritonGPUToLLVM.cpp       |   7 +-
- .../TritonGPUToLLVM/TritonGPUToLLVMBase.h     |  26 +-
- .../TritonGPUToLLVM/TritonGPUToLLVMPass.cpp   |  52 +--
- lib/Conversion/TritonGPUToLLVM/Utility.h      |   5 +-
- .../TritonToTritonGPUPass.cpp                 |  69 ++--
- lib/Dialect/Triton/IR/CMakeLists.txt          |  10 +-
- lib/Dialect/Triton/IR/Ops.cpp                 |  34 +-
- lib/Dialect/Triton/Transforms/Combine.cpp     |   6 +-
- lib/Dialect/Triton/Transforms/Combine.td      |   2 +-
- lib/Dialect/TritonGPU/IR/Dialect.cpp          |  27 +-
- lib/Dialect/TritonGPU/Transforms/Coalesce.cpp |  20 +-
- lib/Dialect/TritonGPU/Transforms/Combine.cpp  |   2 +-
- lib/Dialect/TritonGPU/Transforms/Combine.td   |   1 +
- .../Transforms/DecomposeConversions.cpp       |   2 +-
- lib/Dialect/TritonGPU/Transforms/Pipeline.cpp |  10 +-
- .../Transforms/ReorderInstructions.cpp        |   2 +-
- .../Transforms/TritonGPUConversion.cpp        |  12 +-
- .../Transforms/UpdateMmaForVolta.cpp          |   6 +-
- lib/Dialect/TritonGPU/Transforms/Utility.cpp  |   2 +-
- lib/Target/LLVMIR/CMakeLists.txt              |   3 +-
- lib/Target/PTX/PTXTranslation.cpp             |   3 +
- python/setup.py                               |  15 +-
- python/src/triton.cc                          |  85 +++--
- python/test/unit/language/test_core.py        |   2 +-
- python/triton/compiler.py                     |   4 +-
- test/Analysis/test-alias.mlir                 |  24 +-
- test/Analysis/test-alignment.mlir             | 344 +++++++++---------
- test/Analysis/test-allocation.mlir            |  32 +-
- test/Analysis/test-membar.mlir                |  38 +-
- test/Conversion/triton_ops.mlir               |  10 +-
- test/Conversion/triton_to_tritongpu.mlir      |   6 +-
- test/Conversion/tritongpu_to_llvm.mlir        |  94 ++---
- test/Target/tritongpu_to_llvmir.mlir          |   4 +-
- test/Target/tritongpu_to_ptx.mlir             |   2 +-
- test/Triton/combine.mlir                      |  40 +-
- test/Triton/vecadd.mlir                       |   4 +-
- test/TritonGPU/coalesce.mlir                  |   2 +-
- test/TritonGPU/combine.mlir                   |  38 +-
- test/TritonGPU/loop-pipeline.mlir             |  22 +-
- test/TritonGPU/matmul.mlir                    |   4 +-
- test/TritonGPU/prefetch.mlir                  |   4 +-
- test/TritonGPU/update-mma-for-volta.mlir      |   4 +-
- test/lib/Analysis/TestAlias.cpp               |  29 +-
- test/lib/Analysis/TestAllocation.cpp          |   5 +-
- test/lib/Analysis/TestAxisInfo.cpp            |  51 +--
- test/lib/Analysis/TestMembar.cpp              |   7 +-
- 78 files changed, 808 insertions(+), 742 deletions(-)
-
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index d0d361fc7c..b281a28400 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -1,4 +1,7 @@
- cmake_minimum_required(VERSION 3.6)
-+
-+cmake_policy(SET CMP0116 OLD)
-+
- include(ExternalProject)
- 
- set(CMAKE_CXX_STANDARD 17)
-@@ -155,7 +158,6 @@ if(TRITON_BUILD_PYTHON_MODULE)
-     endif()
- endif()
- 
--
- # # Triton
- # file(GLOB_RECURSE LIBTRITON_SRC lib/*.cc)
- # if (WIN32 AND TRITON_BUILD_PYTHON_MODULE)
-@@ -212,7 +214,7 @@ if(TRITON_BUILD_PYTHON_MODULE)
-     # optimizations
-     MLIRPass
-     MLIRTransforms
--    MLIRLLVMIR
-+    MLIRLLVMDialect
-     MLIRSupport
-     MLIRTargetLLVMIRExport
-     MLIRExecutionEngine
-diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
-index 906f635f8b..695b3479fd 100644
---- a/bin/CMakeLists.txt
-+++ b/bin/CMakeLists.txt
-@@ -48,7 +48,7 @@ llvm_update_compile_flags(triton-translate)
-          # MLIR core
-          MLIROptLib
-          MLIRIR
--         MLIRLLVMIR
-+         MLIRLLVMDialect
-          MLIRPass
-          MLIRSupport
-          MLIRTransforms
-diff --git a/bin/FileCheck/FileCheck.cpp b/bin/FileCheck/FileCheck.cpp
-index 819efc3541..9ac6f1b277 100644
---- a/bin/FileCheck/FileCheck.cpp
-+++ b/bin/FileCheck/FileCheck.cpp
-@@ -19,6 +19,7 @@
- #include "llvm/Support/CommandLine.h"
- #include "llvm/Support/InitLLVM.h"
- #include "llvm/Support/Process.h"
-+#include "llvm/Support/SourceMgr.h"
- #include "llvm/Support/WithColor.h"
- #include "llvm/Support/raw_ostream.h"
- #include <cmath>
-@@ -360,6 +361,8 @@ static std::string GetCheckTypeAbbreviation(Check::FileCheckType Ty) {
-     return "bad-not";
-   case Check::CheckBadCount:
-     return "bad-count";
-+  case Check::CheckMisspelled:
-+    return "misspelled";
-   case Check::CheckNone:
-     llvm_unreachable("invalid FileCheckType");
-   }
-diff --git a/bin/triton-opt.cpp b/bin/triton-opt.cpp
-index 9f3b53b7ae..f96232e1b0 100644
---- a/bin/triton-opt.cpp
-+++ b/bin/triton-opt.cpp
-@@ -8,7 +8,7 @@
- 
- #include "mlir/IR/Dialect.h"
- #include "mlir/InitAllPasses.h"
--#include "mlir/Support/MlirOptMain.h"
-+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
- 
- namespace mlir {
- namespace test {
-@@ -33,8 +33,8 @@ int main(int argc, char **argv) {
-   // TODO: register Triton & TritonGPU passes
-   mlir::DialectRegistry registry;
-   registry.insert<mlir::triton::TritonDialect,
--                  mlir::triton::gpu::TritonGPUDialect, mlir::math::MathDialect,
--                  mlir::arith::ArithmeticDialect, mlir::StandardOpsDialect,
-+                  mlir::triton::gpu::TritonGPUDialect, mlir::func::FuncDialect,
-+                  mlir::math::MathDialect, mlir::arith::ArithmeticDialect,
-                   mlir::scf::SCFDialect, mlir::gpu::GPUDialect>();
- 
-   return mlir::asMainReturnCode(mlir::MlirOptMain(
-diff --git a/bin/triton-translate.cpp b/bin/triton-translate.cpp
-index 05ba15e453..56b5d65857 100644
---- a/bin/triton-translate.cpp
-+++ b/bin/triton-translate.cpp
-@@ -3,7 +3,7 @@
- #include "mlir/IR/AsmState.h"
- #include "mlir/IR/BuiltinOps.h"
- #include "mlir/IR/Dialect.h"
--#include "mlir/Parser.h"
-+#include "mlir/Parser/Parser.h"
- #include "mlir/Pass/Pass.h"
- #include "mlir/Pass/PassManager.h"
- #include "mlir/Support/FileUtilities.h"
-@@ -38,7 +38,7 @@ OwningOpRef<ModuleOp> loadMLIRModule(llvm::StringRef inputFilename,
-   mlir::DialectRegistry registry;
-   registry.insert<TritonDialect, triton::gpu::TritonGPUDialect,
-                   mlir::math::MathDialect, arith::ArithmeticDialect,
--                  StandardOpsDialect, scf::SCFDialect>();
-+                  scf::SCFDialect>();
- 
-   context.appendDialectRegistry(registry);
- 
-@@ -50,7 +50,8 @@ OwningOpRef<ModuleOp> loadMLIRModule(llvm::StringRef inputFilename,
-     context.loadAllAvailableDialects();
-     context.allowUnregisteredDialects();
- 
--    OwningOpRef<ModuleOp> module(parseSourceFile(sourceMgr, &context));
-+    OwningOpRef<ModuleOp> module =
-+        parseSourceFile<ModuleOp>(sourceMgr, &context);
-     if (!module) {
-       llvm::errs() << "Parse MLIR file failed.";
-       return nullptr;
-diff --git a/include/triton/Analysis/Alias.h b/include/triton/Analysis/Alias.h
-index fa6b906fc9..631df518bc 100644
---- a/include/triton/Analysis/Alias.h
-+++ b/include/triton/Analysis/Alias.h
-@@ -2,7 +2,7 @@
- #define TRITON_ANALYSIS_ALIAS_H
- 
- #include "mlir/Analysis/AliasAnalysis.h"
--#include "mlir/Analysis/DataFlowAnalysis.h"
-+#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
- #include "llvm/ADT/DenseSet.h"
- 
- namespace mlir {
-@@ -21,7 +21,7 @@ class AliasInfo {
-   }
- 
-   /// The pessimistic value state of a value without alias
--  static AliasInfo getPessimisticValueState(MLIRContext *context) {
-+  static AliasInfo getPessimisticValueState(MLIRContext *context = nullptr) {
-     return AliasInfo();
-   }
-   static AliasInfo getPessimisticValueState(Value value) { return AliasInfo(); }
-@@ -29,6 +29,10 @@ class AliasInfo {
-   /// The union of both arguments
-   static AliasInfo join(const AliasInfo &lhs, const AliasInfo &rhs);
- 
-+  void print(raw_ostream &os) const {
-+    llvm::interleaveComma(allocs, os, [&](Value alloc) { alloc.print(os); });
-+  }
-+
- private:
-   /// The set of allocated values that are aliased by this lattice.
-   /// For now, we only consider aliased value produced by the following
-@@ -58,9 +62,13 @@ class AliasInfo {
- //===----------------------------------------------------------------------===//
- // Shared Memory Alias Analysis
- //===----------------------------------------------------------------------===//
--class SharedMemoryAliasAnalysis : public ForwardDataFlowAnalysis<AliasInfo> {
-+class SharedMemoryAliasAnalysis
-+    : public dataflow::SparseDataFlowAnalysis<dataflow::Lattice<AliasInfo>> {
- public:
--  using ForwardDataFlowAnalysis<AliasInfo>::ForwardDataFlowAnalysis;
-+  using dataflow::SparseDataFlowAnalysis<
-+      dataflow::Lattice<AliasInfo>>::SparseDataFlowAnalysis;
-+  using dataflow::SparseDataFlowAnalysis<
-+      dataflow::Lattice<AliasInfo>>::getLatticeElement;
- 
-   /// XXX(Keren): Compatible interface with MLIR AliasAnalysis for future use.
-   /// Given two values, returns their aliasing behavior.
-@@ -70,9 +78,10 @@ class SharedMemoryAliasAnalysis : public ForwardDataFlowAnalysis<AliasInfo> {
-   ModRefResult getModRef(Operation *op, Value location);
- 
-   /// Computes if the alloc set of the results are changed.
--  ChangeResult
-+  void
-   visitOperation(Operation *op,
--                 ArrayRef<LatticeElement<AliasInfo> *> operands) override;
-+                 ArrayRef<const dataflow::Lattice<AliasInfo> *> operands,
-+                 ArrayRef<dataflow::Lattice<AliasInfo> *> results) override;
- };
- 
- } // namespace mlir
-diff --git a/include/triton/Analysis/Allocation.h b/include/triton/Analysis/Allocation.h
-index b7c136d602..89b77034cc 100644
---- a/include/triton/Analysis/Allocation.h
-+++ b/include/triton/Analysis/Allocation.h
-@@ -188,6 +188,8 @@ class Allocation {
-   friend class triton::AllocationAnalysis;
- };
- 
-+template <typename T> Interval(T, T) -> Interval<T>;
-+
- } // namespace mlir
- 
- #endif // TRITON_ANALYSIS_ALLOCATION_H
-diff --git a/include/triton/Analysis/AxisInfo.h b/include/triton/Analysis/AxisInfo.h
-index fdfbd8fbb3..7083b9c43b 100644
---- a/include/triton/Analysis/AxisInfo.h
-+++ b/include/triton/Analysis/AxisInfo.h
-@@ -1,9 +1,10 @@
- #ifndef TRITON_ANALYSIS_AXISINFO_H
- #define TRITON_ANALYSIS_AXISINFO_H
- 
--#include "mlir/Analysis/DataFlowAnalysis.h"
-+#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
- #include "llvm/Support/raw_ostream.h"
- 
-+#include "mlir/Support/LLVM.h"
- #include "triton/Analysis/Utility.h"
- #include "triton/Dialect/Triton/IR/Dialect.h"
- #include "triton/Dialect/TritonGPU/IR/Dialect.h"
-@@ -62,7 +63,7 @@ class AxisInfo {
-   }
- 
-   /// The pessimistic value state of the contiguity is unknown.
--  static AxisInfo getPessimisticValueState(MLIRContext *context) {
-+  static AxisInfo getPessimisticValueState(MLIRContext *context = nullptr) {
-     return AxisInfo();
-   }
-   static AxisInfo getPessimisticValueState(Value value);
-@@ -70,6 +71,22 @@ class AxisInfo {
-   /// The gcd of both arguments for each dimension
-   static AxisInfo join(const AxisInfo &lhs, const AxisInfo &rhs);
- 
-+  void print(raw_ostream &os) const {
-+    auto print = [&](StringRef name, DimVectorT vec) {
-+      os << name << " = [";
-+      llvm::interleaveComma(vec, os);
-+      os << "]";
-+    };
-+    print("contiguity", contiguity);
-+    print(", divisibility", divisibility);
-+    print(", constancy", constancy);
-+    os << ", constant_value = ";
-+    if (constantValue)
-+      os << *constantValue;
-+    else
-+      os << "<none>";
-+  }
-+
- private:
-   /// The _contiguity_ information maps the `d`-th
-   /// dimension to the length of the shortest
-@@ -147,7 +164,8 @@ class AxisInfoVisitor {
-   }
- 
-   virtual AxisInfo
--  getAxisInfo(Operation *op, ArrayRef<LatticeElement<AxisInfo> *> operands) = 0;
-+  getAxisInfo(Operation *op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) = 0;
- 
-   virtual bool match(Operation *op) = 0;
- };
-@@ -157,15 +175,16 @@ template <typename OpTy> class AxisInfoVisitorImpl : public AxisInfoVisitor {
- public:
-   using AxisInfoVisitor::AxisInfoVisitor;
- 
--  AxisInfo getAxisInfo(Operation *op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) final {
-+  AxisInfo
-+  getAxisInfo(Operation *op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) final {
-     return getAxisInfo(cast<OpTy>(op), operands);
-   }
- 
-   bool match(Operation *op) final { return isa<OpTy>(op); }
- 
--  virtual AxisInfo getAxisInfo(OpTy op,
--                               ArrayRef<LatticeElement<AxisInfo> *> operands) {
-+  virtual AxisInfo
-+  getAxisInfo(OpTy op, ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) {
-     llvm_unreachable("Unimplemented getAxisInfo");
-   }
- };
-@@ -176,8 +195,9 @@ class BinaryOpVisitorImpl : public AxisInfoVisitorImpl<OpTy> {
- public:
-   using AxisInfoVisitorImpl<OpTy>::AxisInfoVisitorImpl;
- 
--  AxisInfo getAxisInfo(OpTy op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) override {
-+  AxisInfo
-+  getAxisInfo(OpTy op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-     auto lhsInfo = operands[0]->getValue();
-     auto rhsInfo = operands[1]->getValue();
-     auto rank = lhsInfo.getRank();
-@@ -230,7 +250,8 @@ class AxisInfoVisitorList {
-     (visitors.emplace_back(std::make_unique<Ts>()), ...);
-   }
- 
--  AxisInfo apply(Operation *op, ArrayRef<LatticeElement<AxisInfo> *> operands) {
-+  AxisInfo apply(Operation *op,
-+                 ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) {
-     for (auto &visitor : visitors)
-       if (visitor->match(op))
-         return visitor->getAxisInfo(op, operands);
-@@ -241,16 +262,19 @@ class AxisInfoVisitorList {
-   std::vector<std::unique_ptr<AxisInfoVisitor>> visitors;
- };
- 
--class AxisInfoAnalysis : public ForwardDataFlowAnalysis<AxisInfo> {
-+class AxisInfoAnalysis
-+    : public dataflow::SparseDataFlowAnalysis<dataflow::Lattice<AxisInfo>> {
- private:
-   AxisInfoVisitorList visitors;
- 
- public:
--  AxisInfoAnalysis(MLIRContext *context);
-+  AxisInfoAnalysis(DataFlowSolver &solver);
-+  using dataflow::SparseDataFlowAnalysis<
-+      dataflow::Lattice<AxisInfo>>::getLatticeElement;
- 
--  ChangeResult
--  visitOperation(Operation *op,
--                 ArrayRef<LatticeElement<AxisInfo> *> operands) override;
-+  void visitOperation(Operation *op,
-+                      ArrayRef<const dataflow::Lattice<AxisInfo> *> operands,
-+                      ArrayRef<dataflow::Lattice<AxisInfo> *> results) override;
- 
-   unsigned getPtrContiguity(Value ptr);
- 
-@@ -261,4 +285,4 @@ class AxisInfoAnalysis : public ForwardDataFlowAnalysis<AxisInfo> {
- 
- } // namespace mlir
- 
--#endif
-\ No newline at end of file
-+#endif
-diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h
-index c5ac137dc1..ee7fadb59d 100644
---- a/include/triton/Analysis/Utility.h
-+++ b/include/triton/Analysis/Utility.h
-@@ -1,6 +1,7 @@
- #ifndef TRITON_ANALYSIS_UTILITY_H
- #define TRITON_ANALYSIS_UTILITY_H
- 
-+#include "mlir/Analysis/DataFlowFramework.h"
- #include "mlir/Analysis/SliceAnalysis.h"
- #include "triton/Dialect/TritonGPU/IR/Dialect.h"
- #include <algorithm>
-@@ -12,7 +13,7 @@ namespace mlir {
- class ReduceOpHelper {
- public:
-   explicit ReduceOpHelper(triton::ReduceOp op) : op(op) {
--    srcTy = op.operand().getType().cast<RankedTensorType>();
-+    srcTy = op.getOperand().getType().cast<RankedTensorType>();
-   }
- 
-   ArrayRef<int64_t> getSrcShape() { return srcTy.getShape(); }
-@@ -103,6 +104,9 @@ SetVector<Operation *>
- multiRootGetSlice(Operation *op, TransitiveFilter backwardFilter = nullptr,
-                   TransitiveFilter forwardFilter = nullptr);
- 
-+// Create a basic DataFlowSolver with constant and dead code analysis included.
-+std::unique_ptr<DataFlowSolver> createDataFlowSolver();
-+
- } // namespace mlir
- 
- #endif // TRITON_ANALYSIS_UTILITY_H
-diff --git a/include/triton/Conversion/Passes.td b/include/triton/Conversion/Passes.td
-index 70bb20b78e..be00eb2dac 100644
---- a/include/triton/Conversion/Passes.td
-+++ b/include/triton/Conversion/Passes.td
-@@ -12,7 +12,6 @@ def ConvertTritonToTritonGPU: Pass<"convert-triton-to-tritongpu", "mlir::ModuleO
- 
-     let dependentDialects = ["mlir::arith::ArithmeticDialect",
-                              "mlir::math::MathDialect",
--                             "mlir::StandardOpsDialect",
-                              // TODO: Does this pass depend on SCF?
-                              "mlir::scf::SCFDialect",
-                              "mlir::triton::TritonDialect",
-@@ -41,8 +40,7 @@ def ConvertTritonGPUToLLVM : Pass<"convert-triton-gpu-to-llvm", "mlir::ModuleOp"
-                              "mlir::tensor::TensorDialect",
-                              "mlir::triton::TritonDialect",
-                              "mlir::triton::gpu::TritonGPUDialect",
--                             "mlir::NVVM::NVVMDialect",
--                             "mlir::StandardOpsDialect"];
-+                             "mlir::NVVM::NVVMDialect"];
- 
-     let options = [
-         Option<"computeCapability", "compute-capability",
-diff --git a/include/triton/Dialect/Triton/IR/Dialect.h b/include/triton/Dialect/Triton/IR/Dialect.h
-index e8012a51df..15869e262e 100644
---- a/include/triton/Dialect/Triton/IR/Dialect.h
-+++ b/include/triton/Dialect/Triton/IR/Dialect.h
-@@ -1,14 +1,15 @@
- #ifndef TRITON_DIALECT_TRITON_IR_DIALECT_H_
- #define TRITON_DIALECT_TRITON_IR_DIALECT_H_
- 
-+#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
-+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
-+#include "mlir/Dialect/Func/IR/FuncOps.h"
- #include "mlir/Dialect/Math/IR/Math.h"
--#include "mlir/Dialect/SCF/SCF.h"
--#include "mlir/Dialect/StandardOps/IR/Ops.h"
-+#include "mlir/Dialect/SCF/IR/SCF.h"
- #include "mlir/Dialect/Tensor/IR/Tensor.h"
- #include "mlir/IR/BuiltinOps.h"
- #include "mlir/IR/Dialect.h"
- #include "mlir/Interfaces/ControlFlowInterfaces.h"
--
- #include "triton/Dialect/Triton/IR/Dialect.h.inc"
- #include "triton/Dialect/Triton/IR/OpsEnums.h.inc"
- #include "triton/Dialect/Triton/IR/Traits.h"
-diff --git a/include/triton/Dialect/Triton/IR/TritonDialect.td b/include/triton/Dialect/Triton/IR/TritonDialect.td
-index 07b069e14f..d98ce73884 100644
---- a/include/triton/Dialect/Triton/IR/TritonDialect.td
-+++ b/include/triton/Dialect/Triton/IR/TritonDialect.td
-@@ -25,12 +25,9 @@ def Triton_Dialect : Dialect {
-   let dependentDialects = [
-     "arith::ArithmeticDialect",
-     "math::MathDialect",
--    "StandardOpsDialect",
-     "scf::SCFDialect",
--
--    // Since LLVM 15
--    // "cf::ControlFlowDialect",
--    // "func::FuncDialect"
-+    "cf::ControlFlowDialect",
-+    "func::FuncDialect"
-   ];
- 
-   let extraClassDeclaration = [{
-@@ -38,6 +35,7 @@ def Triton_Dialect : Dialect {
-   }];
- 
-   let hasConstantMaterializer = 1;
-+  let useDefaultTypePrinterParser = 1;
- }
- 
- include "triton/Dialect/Triton/IR/TritonTypes.td"
-diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td
-index 779e0b648c..0a69211179 100644
---- a/include/triton/Dialect/Triton/IR/TritonOps.td
-+++ b/include/triton/Dialect/Triton/IR/TritonOps.td
-@@ -141,11 +141,7 @@ def TT_LoadOp : TT_Op<"load",
-                        "triton::EvictionPolicy":$evict, "bool":$isVolatile)>,
-     ];
- 
--    // let assemblyFormat = "operands attr-dict `:` type($result)";
--    let parser = [{ return mlir::triton::parseLoadOp(parser, result); }];
--
--    let printer = [{ return mlir::triton::printLoadOp(p, *this); }];
--
-+    let hasCustomAssemblyFormat = 1;
-     let hasCanonicalizer = 1;
- }
- 
-@@ -170,11 +166,7 @@ def TT_StoreOp : TT_Op<"store",
-                        "triton::EvictionPolicy":$evict)>,
-     ];
- 
--    // let assemblyFormat = "operands attr-dict `:` type($value)";
--    let parser = [{ return mlir::triton::parseStoreOp(parser, result); }];
--
--    let printer = [{ return mlir::triton::printStoreOp(p, *this); }];
--
-+    let hasCustomAssemblyFormat = 1;
-     let hasCanonicalizer = 1;
- }
- 
-diff --git a/include/triton/Dialect/Triton/IR/TritonTypes.td b/include/triton/Dialect/Triton/IR/TritonTypes.td
-index 66d2a7b9a9..2fe2fd077d 100644
---- a/include/triton/Dialect/Triton/IR/TritonTypes.td
-+++ b/include/triton/Dialect/Triton/IR/TritonTypes.td
-@@ -1,6 +1,7 @@
- #ifndef TRITON_TYPES
- #define TRITON_TYPES
- 
-+include "mlir/IR/AttrTypeBase.td"
- include "triton/Dialect/Triton/IR/TritonDialect.td"
- 
- //
-@@ -58,6 +59,7 @@ def TT_Ptr : TritonTypeDef<"Pointer", "ptr"> {
-         }]>
-     ];
- 
-+    let hasCustomAssemblyFormat = 1;
-     let skipDefaultBuilders = 1;
- }
- def TT_PtrTensor : TensorOf<[TT_Ptr]>;
-diff --git a/include/triton/Dialect/Triton/Transforms/Passes.td b/include/triton/Dialect/Triton/Transforms/Passes.td
-index 8f77aed774..a25cdc5680 100644
---- a/include/triton/Dialect/Triton/Transforms/Passes.td
-+++ b/include/triton/Dialect/Triton/Transforms/Passes.td
-@@ -16,8 +16,7 @@ def TritonCombineOps : Pass</*cli-arg*/"triton-combine", /*Op*/"mlir::ModuleOp">
- 
-   let constructor = "mlir::triton::createCombineOpsPass()";
- 
--  let dependentDialects = ["mlir::arith::ArithmeticDialect",
--                           /*SelectOp*/"mlir::StandardOpsDialect"];
-+  let dependentDialects = ["mlir::arith::ArithmeticDialect"];
- }
- 
- #endif
-diff --git a/include/triton/Dialect/TritonGPU/IR/Dialect.h b/include/triton/Dialect/TritonGPU/IR/Dialect.h
-index b4c8daec7b..dfc5f53ab1 100644
---- a/include/triton/Dialect/TritonGPU/IR/Dialect.h
-+++ b/include/triton/Dialect/TritonGPU/IR/Dialect.h
-@@ -1,19 +1,17 @@
- #ifndef TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_
- #define TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_
- 
--#include "mlir/Dialect/GPU/GPUDialect.h"
-+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
- #include "mlir/Dialect/Tensor/IR/Tensor.h"
- #include "mlir/IR/BuiltinOps.h"
- #include "mlir/IR/Dialect.h"
- 
- // TritonGPU depends on Triton
- #include "triton/Dialect/Triton/IR/Dialect.h"
--
- #include "triton/Dialect/TritonGPU/IR/Dialect.h.inc"
- #include "triton/Dialect/TritonGPU/IR/Traits.h"
- 
- #define GET_ATTRDEF_CLASSES
--#include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc"
- #include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.h.inc"
- 
- #define GET_OP_CLASSES
-diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
-index 0242c3cc17..af2aeb03a8 100644
---- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
-+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
-@@ -1,6 +1,7 @@
- #ifndef TRITONGPU_ATTRDEFS
- #define TRITONGPU_ATTRDEFS
- 
-+include "mlir/IR/AttrTypeBase.td"
- include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td"
- include "triton/Dialect/Triton/IR/TritonInterfaces.td"
- 
-@@ -136,6 +137,7 @@ A_{3, 2}  A_{3, 3}  A_{3, 0}  A_{3, 1} ...   [phase 1] /
-   ];
- 
-   let extraClassDeclaration = extraBaseClassDeclaration;
-+  let hasCustomAssemblyFormat = 1;
- }
- 
- //===----------------------------------------------------------------------===//
-@@ -273,6 +275,7 @@ for
-     // ArrayRefParameter<"unsigned">:$sizePerCTA
-   );
- 
-+  let hasCustomAssemblyFormat = 1;
- }
- 
- //===----------------------------------------------------------------------===//
-@@ -422,6 +425,7 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
-     static constexpr int numBitsToHoldMmaV1ID{5};
-   }];
- 
-+  let hasCustomAssemblyFormat = 1;
- }
- 
- def SliceEncodingAttr : DistributedEncoding<"SliceEncoding"> {
-@@ -456,6 +460,8 @@ def SliceEncodingAttr : DistributedEncoding<"SliceEncoding"> {
-     template<class T>
-     SmallVector<T> paddedShape(ArrayRef<T> shape) const;
-   }];
-+
-+  let hasCustomAssemblyFormat = 1;
- }
- 
- def DotOperandEncodingAttr : DistributedEncoding<"DotOperandEncoding"> {
-@@ -492,6 +498,7 @@ section 9.7.13.4.1 for more details.
- 
-   ];
- 
-+  let hasCustomAssemblyFormat = 1;
-   let extraClassDeclaration = extraBaseClassDeclaration;
- }
- 
-diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td
-index 87ec1d36c6..6489a721b4 100644
---- a/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td
-+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td
-@@ -30,7 +30,7 @@ def TritonGPU_Dialect : Dialect {
-     }
-   }];
-   
--
-+  let useDefaultAttributePrinterParser = 1;
- }
- 
- #endif
-diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
-index 510f8d0183..7aba11dc75 100644
---- a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
-+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
-@@ -59,7 +59,7 @@ def TTG_AsyncCommitGroupOp : TTG_Op<"async_commit_group"> {
- // This is needed because these ops don't
- // handle encodings
- // e.g., https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td#L111
--def TTG_CmpIOp : TTG_Op<"cmpi", [NoSideEffect, Elementwise, 
-+def TTG_CmpIOp : TTG_Op<"cmpi", [NoSideEffect, Elementwise,
-                                  SameOperandsAndResultShape, 
-                                  SameOperandsAndResultEncoding]> {
-   let summary = "integer comparison operation";
-@@ -73,7 +73,7 @@ def TTG_CmpIOp : TTG_Op<"cmpi", [NoSideEffect, Elementwise,
-   let results = (outs TT_BoolLike:$result);
- }
- 
--def TTG_CmpFOp : TTG_Op<"cmpf", [NoSideEffect, Elementwise, 
-+def TTG_CmpFOp : TTG_Op<"cmpf", [NoSideEffect, Elementwise,
-                                  SameOperandsAndResultShape, 
-                                  SameOperandsAndResultEncoding]> {
-   let summary = "floating-point comparison operation";
-@@ -88,8 +88,8 @@ def TTG_CmpFOp : TTG_Op<"cmpf", [NoSideEffect, Elementwise,
- }
- 
- // TODO: migrate to arith::SelectOp on LLVM16
--def TTG_SelectOp : TTG_Op<"select", [NoSideEffect, Elementwise, 
--                                     SameOperandsAndResultShape, 
-+def TTG_SelectOp : TTG_Op<"select", [NoSideEffect, Elementwise,
-+                                     SameOperandsAndResultShape,
-                                      SameOperandsAndResultEncoding]> {
-   let summary = "select operation";
- 
-@@ -188,10 +188,7 @@ def TTG_InsertSliceAsyncOp : TTG_Op<"insert_slice_async",
-     }
-   }];
- 
--  // The custom parser could be replaced with oilist in LLVM-16
--  let parser = [{ return parseInsertSliceAsyncOp(parser, result); }];
--
--  let printer = [{ return printInsertSliceAsyncOp(p, *this); }];
-+  let hasCustomAssemblyFormat = 1;
- }
- 
- def TTG_AllocTensorOp : TTG_Op<"alloc_tensor", [MemoryEffects<[MemAlloc]>,  // Allocate shared memory
-diff --git a/lib/Analysis/Alias.cpp b/lib/Analysis/Alias.cpp
-index a39e4de9aa..208fdd4afc 100644
---- a/lib/Analysis/Alias.cpp
-+++ b/lib/Analysis/Alias.cpp
-@@ -18,8 +18,9 @@ AliasInfo AliasInfo::join(const AliasInfo &lhs, const AliasInfo &rhs) {
-   return ret;
- }
- 
--ChangeResult SharedMemoryAliasAnalysis::visitOperation(
--    Operation *op, ArrayRef<LatticeElement<AliasInfo> *> operands) {
-+void SharedMemoryAliasAnalysis::visitOperation(
-+    Operation *op, ArrayRef<const dataflow::Lattice<AliasInfo> *> operands,
-+    ArrayRef<dataflow::Lattice<AliasInfo> *> results) {
-   AliasInfo aliasInfo;
-   bool pessimistic = true;
-   if (maybeSharedAllocationOp(op)) {
-@@ -44,14 +45,11 @@ ChangeResult SharedMemoryAliasAnalysis::visitOperation(
-   }
- 
-   if (pessimistic) {
--    return markAllPessimisticFixpoint(op->getResults());
-+    return markAllPessimisticFixpoint(results);
-   }
-   // Join all lattice elements
--  ChangeResult result = ChangeResult::NoChange;
--  for (Value value : op->getResults()) {
--    result |= getLatticeElement(value).join(aliasInfo);
--  }
--  return result;
-+  for (auto *result : results)
-+    propagateIfChanged(result, result->join(aliasInfo));
- }
- 
- AliasResult SharedMemoryAliasAnalysis::alias(Value lhs, Value rhs) {
-diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp
-index 712c08c475..b4de8dcd9d 100644
---- a/lib/Analysis/Allocation.cpp
-+++ b/lib/Analysis/Allocation.cpp
-@@ -1,4 +1,5 @@
- #include "triton/Analysis/Allocation.h"
-+#include "mlir/Analysis/DataFlowFramework.h"
- #include "mlir/Analysis/Liveness.h"
- #include "mlir/Analysis/SliceAnalysis.h"
- #include "mlir/Dialect/Tensor/IR/Tensor.h"
-@@ -33,10 +34,8 @@ constexpr int kPtrBitWidth = 64;
- 
- static std::pair<SmallVector<unsigned>, SmallVector<unsigned>>
- getCvtOrder(const Attribute &srcLayout, const Attribute &dstLayout) {
--  auto srcBlockedLayout = srcLayout.dyn_cast<BlockedEncodingAttr>();
-   auto srcMmaLayout = srcLayout.dyn_cast<MmaEncodingAttr>();
-   auto srcDotLayout = srcLayout.dyn_cast<DotOperandEncodingAttr>();
--  auto dstBlockedLayout = dstLayout.dyn_cast<BlockedEncodingAttr>();
-   auto dstMmaLayout = dstLayout.dyn_cast<MmaEncodingAttr>();
-   auto dstDotLayout = dstLayout.dyn_cast<DotOperandEncodingAttr>();
-   assert(!(srcMmaLayout && dstMmaLayout) &&
-@@ -224,14 +223,12 @@ class AllocationAnalysis {
-   }
- 
-   void getValueAlias(Value value, SharedMemoryAliasAnalysis &analysis) {
--    LatticeElement<AliasInfo> *latticeElement =
--        analysis.lookupLatticeElement(value);
--    if (latticeElement) {
--      auto &info = latticeElement->getValue();
--      if (!info.getAllocs().empty()) {
--        for (auto alloc : info.getAllocs()) {
--          allocation->addAlias(value, alloc);
--        }
-+    dataflow::Lattice<AliasInfo> *latticeElement =
-+        analysis.getLatticeElement(value);
-+    if (latticeElement && !latticeElement->isUninitialized()) {
-+      AliasInfo &info = latticeElement->getValue();
-+      for (auto alloc : info.getAllocs()) {
-+        allocation->addAlias(value, alloc);
-       }
-     }
-   }
-@@ -244,14 +241,19 @@ class AllocationAnalysis {
-       getScratchValueSize(op);
-     });
-     // Get the alias values
--    SharedMemoryAliasAnalysis aliasAnalysis(operation->getContext());
--    aliasAnalysis.run(operation);
-+    std::unique_ptr<DataFlowSolver> solver = createDataFlowSolver();
-+    SharedMemoryAliasAnalysis *aliasAnalysis =
-+        solver->load<SharedMemoryAliasAnalysis>();
-+    if (failed(solver->initializeAndRun(operation))) {
-+      // TODO: return error instead of bailing out..
-+      llvm_unreachable("failed to run SharedMemoryAliasAnalysis");
-+    }
-     operation->walk<WalkOrder::PreOrder>([&](Operation *op) {
-       for (auto operand : op->getOperands()) {
--        getValueAlias(operand, aliasAnalysis);
-+        getValueAlias(operand, *aliasAnalysis);
-       }
-       for (auto value : op->getResults()) {
--        getValueAlias(value, aliasAnalysis);
-+        getValueAlias(value, *aliasAnalysis);
-       }
-     });
-   }
-diff --git a/lib/Analysis/AxisInfo.cpp b/lib/Analysis/AxisInfo.cpp
-index 0b7142b04d..4af46c3fbb 100644
---- a/lib/Analysis/AxisInfo.cpp
-+++ b/lib/Analysis/AxisInfo.cpp
-@@ -1,4 +1,4 @@
--#include "mlir/Analysis/DataFlowAnalysis.h"
-+#include "mlir/Analysis/DataFlowFramework.h"
- #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
- #include "llvm/Support/raw_ostream.h"
- 
-@@ -52,7 +52,7 @@ AxisInfo AxisInfo::getPessimisticValueState(Value value) {
-   BlockArgument blockArg = value.dyn_cast<BlockArgument>();
-   if (blockArg && blockArg.getOwner()->isEntryBlock()) {
-     Operation *op = blockArg.getOwner()->getParentOp();
--    if (FuncOp fun = dyn_cast<FuncOp>(op)) {
-+    if (func::FuncOp fun = dyn_cast<func::FuncOp>(op)) {
-       Attribute attr =
-           fun.getArgAttr(blockArg.getArgNumber(), "tt.divisibility");
-       if (attr)
-@@ -136,8 +136,9 @@ class CastOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
- public:
-   using AxisInfoVisitorImpl<OpTy>::AxisInfoVisitorImpl;
- 
--  AxisInfo getAxisInfo(OpTy op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) override {
-+  AxisInfo
-+  getAxisInfo(OpTy op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-     return operands[0]->getValue();
-   }
- };
-@@ -147,8 +148,9 @@ class MakeRangeOpAxisInfoVisitor final
- public:
-   using AxisInfoVisitorImpl<triton::MakeRangeOp>::AxisInfoVisitorImpl;
- 
--  AxisInfo getAxisInfo(triton::MakeRangeOp op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) override {
-+  AxisInfo
-+  getAxisInfo(triton::MakeRangeOp op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-     auto start = op.start();
-     auto end = op.end();
-     return AxisInfo(/*contiguity=*/{end - start},
-@@ -162,8 +164,9 @@ class ConstantOpAxisInfoVisitor final
- public:
-   using AxisInfoVisitorImpl<arith::ConstantOp>::AxisInfoVisitorImpl;
- 
--  AxisInfo getAxisInfo(arith::ConstantOp op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) override {
-+  AxisInfo
-+  getAxisInfo(arith::ConstantOp op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-     auto intAttr = op.getValue().dyn_cast<IntegerAttr>();
-     auto boolAttr = op.getValue().dyn_cast<BoolAttr>();
-     if (intAttr || boolAttr) {
-@@ -416,8 +419,9 @@ class SplatOpAxisInfoVisitor final
- public:
-   using AxisInfoVisitorImpl<triton::SplatOp>::AxisInfoVisitorImpl;
- 
--  AxisInfo getAxisInfo(triton::SplatOp op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) override {
-+  AxisInfo
-+  getAxisInfo(triton::SplatOp op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-     Type _retTy = *op->result_type_begin();
-     TensorType retTy = _retTy.cast<TensorType>();
-     AxisInfo opInfo = operands[0]->getValue();
-@@ -439,8 +443,9 @@ class ExpandDimsOpAxisInfoVisitor final
- public:
-   using AxisInfoVisitorImpl<triton::ExpandDimsOp>::AxisInfoVisitorImpl;
- 
--  AxisInfo getAxisInfo(triton::ExpandDimsOp op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) override {
-+  AxisInfo
-+  getAxisInfo(triton::ExpandDimsOp op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-     AxisInfo opInfo = operands[0]->getValue();
-     AxisInfo::DimVectorT contiguity = opInfo.getContiguity();
-     AxisInfo::DimVectorT divisibility = opInfo.getDivisibility();
-@@ -458,8 +463,9 @@ class BroadcastOpAxisInfoVisitor final
- public:
-   using AxisInfoVisitorImpl<triton::BroadcastOp>::AxisInfoVisitorImpl;
- 
--  AxisInfo getAxisInfo(triton::BroadcastOp op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) override {
-+  AxisInfo
-+  getAxisInfo(triton::BroadcastOp op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-     Type _retTy = *op->result_type_begin();
-     Type _opTy = *op->operand_type_begin();
-     TensorType retTy = _retTy.cast<TensorType>();
-@@ -486,8 +492,9 @@ class CmpOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
- public:
-   using AxisInfoVisitorImpl<OpTy>::AxisInfoVisitorImpl;
- 
--  AxisInfo getAxisInfo(OpTy op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) override {
-+  AxisInfo
-+  getAxisInfo(OpTy op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-     auto resTy = op.getResult().getType().template dyn_cast<RankedTensorType>();
-     if (!resTy)
-       return AxisInfo();
-@@ -596,8 +603,9 @@ class SelectOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
- public:
-   using AxisInfoVisitorImpl<OpTy>::AxisInfoVisitorImpl;
- 
--  AxisInfo getAxisInfo(OpTy op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) override {
-+  AxisInfo
-+  getAxisInfo(OpTy op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-     auto resTy = op.getResult().getType().template dyn_cast<RankedTensorType>();
-     if (!resTy)
-       return AxisInfo();
-@@ -757,8 +765,9 @@ class MaxMinOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
- public:
-   using AxisInfoVisitorImpl<OpTy>::AxisInfoVisitorImpl;
- 
--  AxisInfo getAxisInfo(OpTy op,
--                       ArrayRef<LatticeElement<AxisInfo> *> operands) override {
-+  AxisInfo
-+  getAxisInfo(OpTy op,
-+              ArrayRef<const dataflow::Lattice<AxisInfo> *> operands) override {
-     auto lhsInfo = operands[0]->getValue();
-     auto rhsInfo = operands[1]->getValue();
-     std::optional<int64_t> constantValue;
-@@ -786,8 +795,8 @@ class MaxMinOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
- // AxisInfoAnalysis
- //===----------------------------------------------------------------------===//
- 
--AxisInfoAnalysis::AxisInfoAnalysis(MLIRContext *context)
--    : ForwardDataFlowAnalysis<AxisInfo>(context) {
-+AxisInfoAnalysis::AxisInfoAnalysis(DataFlowSolver &solver)
-+    : dataflow::SparseDataFlowAnalysis<dataflow::Lattice<AxisInfo>>(solver) {
-   // UnrealizedConversionCast:
-   // This is needed by TritonGPUToLLVM, to get AxisInfo when the graph is
-   // in the process of a PartialConversion, where UnrealizedConversionCast
-@@ -819,7 +828,7 @@ AxisInfoAnalysis::AxisInfoAnalysis(MLIRContext *context)
-   visitors.append<LogicalOpAxisInfoVisitor<arith::AndIOp>,
-                   LogicalOpAxisInfoVisitor<arith::OrIOp>,
-                   LogicalOpAxisInfoVisitor<arith::XOrIOp>>();
--  visitors.append<SelectOpAxisInfoVisitor<mlir::SelectOp>,
-+  visitors.append<SelectOpAxisInfoVisitor<mlir::arith::SelectOp>,
-                   SelectOpAxisInfoVisitor<triton::gpu::SelectOp>>();
-   visitors.append<ShLIOpAxisInfoVisitor, ShROpAxisInfoVisitor<arith::ShRUIOp>,
-                   ShROpAxisInfoVisitor<arith::ShRSIOp>>();
-@@ -829,11 +838,12 @@ AxisInfoAnalysis::AxisInfoAnalysis(MLIRContext *context)
-                   MaxMinOpAxisInfoVisitor<arith::MinUIOp>>();
- }
- 
--ChangeResult AxisInfoAnalysis::visitOperation(
--    Operation *op, ArrayRef<LatticeElement<AxisInfo> *> operands) {
-+void AxisInfoAnalysis::visitOperation(
-+    Operation *op, ArrayRef<const dataflow::Lattice<AxisInfo> *> operands,
-+    ArrayRef<dataflow::Lattice<AxisInfo> *> results) {
-   AxisInfo curr = visitors.apply(op, operands);
-   if (curr.getRank() == 0) {
--    return markAllPessimisticFixpoint(op->getResults());
-+    return markAllPessimisticFixpoint(results);
-   }
-   // override with hint
-   auto newContiguity = curr.getContiguity();
-@@ -854,11 +864,8 @@ ChangeResult AxisInfoAnalysis::visitOperation(
-   curr = mlir::AxisInfo(newContiguity, newDivisibility, newConstancy,
-                         curr.getConstantValue());
-   // join all lattice elements
--  ChangeResult result = ChangeResult::NoChange;
--  for (Value value : op->getResults()) {
--    result |= getLatticeElement(value).join(curr);
--  }
--  return result;
-+  for (auto *result : results)
-+    propagateIfChanged(result, result->join(curr));
- }
- 
- unsigned AxisInfoAnalysis::getPtrContiguity(Value ptr) {
-@@ -884,7 +891,10 @@ unsigned AxisInfoAnalysis::getPtrAlignment(Value ptr) {
-   auto tensorTy = ptr.getType().dyn_cast<RankedTensorType>();
-   if (!tensorTy)
-     return 1;
--  auto axisInfo = lookupLatticeElement(ptr)->getValue();
-+  dataflow::Lattice<AxisInfo> *latticeElement = getLatticeElement(ptr);
-+  if (!latticeElement || latticeElement->isUninitialized())
-+    return 1;
-+  auto axisInfo = latticeElement->getValue();
-   auto layout = tensorTy.getEncoding();
-   auto order = triton::gpu::getOrder(layout);
-   auto maxMultipleBytes = axisInfo.getDivisibility(order[0]);
-@@ -900,8 +910,11 @@ unsigned AxisInfoAnalysis::getMaskAlignment(Value mask) {
-   auto tensorTy = mask.getType().dyn_cast<RankedTensorType>();
-   if (!tensorTy)
-     return 1;
-+  dataflow::Lattice<AxisInfo> *latticeElement = getLatticeElement(mask);
-+  if (!latticeElement || latticeElement->isUninitialized())
-+    return 1;
-+  auto maskAxis = latticeElement->getValue();
-   auto maskOrder = triton::gpu::getOrder(tensorTy.getEncoding());
--  auto maskAxis = lookupLatticeElement(mask)->getValue();
-   auto alignment = std::max<unsigned>(maskAxis.getConstancy(maskOrder[0]), 1);
-   return alignment;
- }
-diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
-index afbc692510..1f761f845c 100644
---- a/lib/Analysis/CMakeLists.txt
-+++ b/lib/Analysis/CMakeLists.txt
-@@ -8,7 +8,7 @@ add_mlir_library(TritonAnalysis
-   DEPENDS
-   TritonTableGen
-   TritonGPUAttrDefsIncGen
--  
-+
-   LINK_LIBS PUBLIC
-   MLIRAnalysis
- )
-diff --git a/lib/Analysis/Membar.cpp b/lib/Analysis/Membar.cpp
-index acc885e827..910274b2ac 100644
---- a/lib/Analysis/Membar.cpp
-+++ b/lib/Analysis/Membar.cpp
-@@ -2,7 +2,7 @@
- #include "triton/Analysis/Alias.h"
- #include "triton/Dialect/TritonGPU/IR/Dialect.h"
- 
--#include "mlir/Dialect/GPU/GPUDialect.h"
-+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
- #include "mlir/Dialect/Tensor/IR/Tensor.h"
- 
- namespace mlir {
-diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp
-index d9e917e731..6ea52df272 100644
---- a/lib/Analysis/Utility.cpp
-+++ b/lib/Analysis/Utility.cpp
-@@ -1,5 +1,8 @@
- #include "triton/Analysis/Utility.h"
-+#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
-+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
- #include "mlir/IR/Dialect.h"
-+#include "mlir/IR/Matchers.h"
- #include "triton/Dialect/Triton/IR/Dialect.h"
- #include "triton/Dialect/TritonGPU/IR/Dialect.h"
- #include <deque>
-@@ -325,4 +328,55 @@ SetVector<Operation *> multiRootGetSlice(Operation *op,
-   return multiRootTopologicalSort(slice);
- }
- 
-+namespace {
-+// Copied from TestDeadCodeAnalysis.cpp, because some dead code analysis
-+// interacts with constant propagation, but SparseConstantPropagation
-+// doesn't seem to be sufficient.
-+struct ConstantAnalysis : public DataFlowAnalysis {
-+  using DataFlowAnalysis::DataFlowAnalysis;
-+
-+  LogicalResult initialize(Operation *top) override {
-+    WalkResult result = top->walk([&](Operation *op) {
-+      if (failed(visit(op)))
-+        return WalkResult::interrupt();
-+      return WalkResult::advance();
-+    });
-+    return success(!result.wasInterrupted());
-+  }
-+
-+  LogicalResult visit(ProgramPoint point) override {
-+    Operation *op = point.get<Operation *>();
-+    Attribute value;
-+    if (matchPattern(op, m_Constant(&value))) {
-+      auto *constant = getOrCreate<dataflow::Lattice<dataflow::ConstantValue>>(
-+          op->getResult(0));
-+      propagateIfChanged(constant, constant->join(dataflow::ConstantValue(
-+                                       value, op->getDialect())));
-+      return success();
-+    }
-+    setAllToUnknownConstants(op->getResults());
-+    for (Region &region : op->getRegions())
-+      setAllToUnknownConstants(region.getArguments());
-+    return success();
-+  }
-+
-+  /// Set all given values as not constants.
-+  void setAllToUnknownConstants(ValueRange values) {
-+    dataflow::ConstantValue unknownConstant(nullptr, nullptr);
-+    for (Value value : values) {
-+      auto *constant =
-+          getOrCreate<dataflow::Lattice<dataflow::ConstantValue>>(value);
-+      propagateIfChanged(constant, constant->join(unknownConstant));
-+    }
-+  }
-+};
-+} // namespace
-+
-+std::unique_ptr<DataFlowSolver> createDataFlowSolver() {
-+  auto solver = std::make_unique<DataFlowSolver>();
-+  solver->load<dataflow::DeadCodeAnalysis>();
-+  solver->load<ConstantAnalysis>();
-+  return solver;
-+}
-+
- } // namespace mlir
-diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
-index 6a46265bd7..e352eb3698 100644
---- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
-+++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
-@@ -159,9 +159,6 @@ struct ConvertLayoutOpConversion
-                       Value smemBase) const {
-     auto accumNumCTAsEachRep = product<unsigned>(numCTAsEachRep);
-     auto layout = type.getEncoding();
--    auto blockedLayout = layout.dyn_cast<BlockedEncodingAttr>();
--    auto sliceLayout = layout.dyn_cast<SliceEncodingAttr>();
--    auto mmaLayout = layout.dyn_cast<MmaEncodingAttr>();
-     auto rank = type.getRank();
-     auto sizePerThread = getSizePerThread(layout);
-     auto accumSizePerThread = product<unsigned>(sizePerThread);
-diff --git a/lib/Conversion/TritonGPUToLLVM/DotOpHelpers.h b/lib/Conversion/TritonGPUToLLVM/DotOpHelpers.h
-index 4b89965aa9..1d9e00519b 100644
---- a/lib/Conversion/TritonGPUToLLVM/DotOpHelpers.h
-+++ b/lib/Conversion/TritonGPUToLLVM/DotOpHelpers.h
-@@ -7,10 +7,8 @@
- #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
- #include "mlir/Conversion/LLVMCommon/Pattern.h"
- #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
--#include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
--#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
- #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
--#include "mlir/Dialect/GPU/GPUDialect.h"
-+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
- #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
- #include "mlir/Dialect/Tensor/IR/Tensor.h"
- #include "mlir/IR/Matchers.h"
-@@ -422,9 +420,9 @@ struct MMA16816ConversionHelper {
-   MMA16816ConversionHelper(Type dotOperand, MmaEncodingAttr mmaLayout,
-                            Value thread, ConversionPatternRewriter &rewriter,
-                            TypeConverter *typeConverter, Location loc)
--      : mmaLayout(mmaLayout), thread(thread), helper(mmaLayout),
--        rewriter(rewriter), typeConverter(typeConverter), loc(loc),
--        ctx(mmaLayout.getContext()), wpt(mmaLayout.getWarpsPerCTA()) {
-+      : mmaLayout(mmaLayout), wpt(mmaLayout.getWarpsPerCTA()), thread(thread),
-+        helper(mmaLayout), rewriter(rewriter), typeConverter(typeConverter),
-+        loc(loc), ctx(mmaLayout.getContext()) {
-     helper.deduceMmaType(dotOperand);
- 
-     Value _32 = i32_val(32);
-diff --git a/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM.cpp
-index 0f8070ca9f..e4bd47c411 100644
---- a/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM.cpp
-+++ b/lib/Conversion/TritonGPUToLLVM/DotOpToLLVM.cpp
-@@ -115,8 +115,6 @@ struct DotOpConversion : public ConvertTritonGPUOpToLLVMPattern<triton::DotOp> {
-     auto DTensorTy = D.getType().cast<RankedTensorType>();
-     auto AShape = ATensorTy.getShape();
-     auto BShape = BTensorTy.getShape();
--    auto DShape = DTensorTy.getShape();
--    auto wpt = mmaLayout.getWarpsPerCTA();
- 
-     bool isARow = ALayout.getIsMMAv1Row().cast<BoolAttr>().getValue();
-     bool isBRow = BLayout.getIsMMAv1Row().cast<BoolAttr>().getValue();
-@@ -221,7 +219,6 @@ struct DotOpConversion : public ConvertTritonGPUOpToLLVMPattern<triton::DotOp> {
-                               ConversionPatternRewriter &rewriter) const {
-     auto *ctx = rewriter.getContext();
-     auto loc = op.getLoc();
--    auto threadId = getThreadId(rewriter, loc);
- 
-     auto A = op.a();
-     auto B = op.b();
-@@ -230,12 +227,10 @@ struct DotOpConversion : public ConvertTritonGPUOpToLLVMPattern<triton::DotOp> {
- 
-     auto aTensorTy = A.getType().cast<RankedTensorType>();
-     auto bTensorTy = B.getType().cast<RankedTensorType>();
--    auto cTensorTy = C.getType().cast<RankedTensorType>();
-     auto dTensorTy = D.getType().cast<RankedTensorType>();
- 
-     auto aShape = aTensorTy.getShape();
-     auto bShape = bTensorTy.getShape();
--    auto cShape = cTensorTy.getShape();
- 
-     BlockedEncodingAttr dLayout =
-         dTensorTy.getEncoding().cast<BlockedEncodingAttr>();
-diff --git a/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
-index deb71b9597..0b9e67674b 100644
---- a/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
-+++ b/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
-@@ -61,7 +61,6 @@ struct FpToFpOpConversion
-   convertFp16x4ToFp8x4(Location loc, ConversionPatternRewriter &rewriter,
-                        const Value &v0, const Value &v1, const Value &v2,
-                        const Value &v3) {
--    auto ctx = rewriter.getContext();
-     auto fp16x2VecTy = vec_ty(f16_ty, 2);
-     Value fp16x2Vec0 = undef(fp16x2VecTy);
-     Value fp16x2Vec1 = undef(fp16x2VecTy);
-@@ -153,7 +152,6 @@ struct FpToFpOpConversion
-   convertBf16x4ToFp8x4(Location loc, ConversionPatternRewriter &rewriter,
-                        const Value &v0, const Value &v1, const Value &v2,
-                        const Value &v3) {
--    auto ctx = rewriter.getContext();
-     auto bf16x2VecTy = vec_ty(i16_ty, 2);
-     Value bf16x2Vec0 = undef(bf16x2VecTy);
-     Value bf16x2Vec1 = undef(bf16x2VecTy);
-diff --git a/lib/Conversion/TritonGPUToLLVM/LoadStoreOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/LoadStoreOpToLLVM.cpp
-index 9a8b4702bc..bae675f0cb 100644
---- a/lib/Conversion/TritonGPUToLLVM/LoadStoreOpToLLVM.cpp
-+++ b/lib/Conversion/TritonGPUToLLVM/LoadStoreOpToLLVM.cpp
-@@ -109,7 +109,8 @@ struct LoadOpConversion
-     DenseElementsAttr constAttr;
-     int64_t splatVal = 0;
-     if (other && valueElemTy.isa<IntegerType>() &&
--        matchPattern(other, m_Constant(&constAttr)) && constAttr.isSplat()) {
-+        matchPattern(other, m_Constant(&constAttr)) && constAttr.isSplat() &&
-+        constAttr.getElementType().isa<IntegerType>()) {
-       otherIsSplatConstInt = true;
-       splatVal = constAttr.getSplatValue<APInt>().getSExtValue();
-     }
-@@ -333,7 +334,6 @@ struct StoreOpConversion
-             elem = rewriter.create<LLVM::SExtOp>(loc, type::i8Ty(ctx), elem);
-           elem = bitcast(elem, valueElemTy);
- 
--          Type u32Ty = typeConverter->convertType(type::u32Ty(ctx));
-           llWord = insert_element(wordTy, llWord, elem, i32_val(elemIdx));
-         }
-         llWord = bitcast(llWord, valArgTy);
-@@ -387,7 +387,6 @@ struct AtomicCASOpConversion
-                   ConversionPatternRewriter &rewriter) const override {
-     auto loc = op.getLoc();
-     MLIRContext *ctx = rewriter.getContext();
--    Value ptr = op.ptr();
- 
-     Value llPtr = adaptor.ptr();
-     Value llCmp = adaptor.cmp();
-diff --git a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp
-index 69abd889be..1c973dc196 100644
---- a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp
-+++ b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp
-@@ -286,7 +286,6 @@ struct ReduceOpConversion
-     auto srcTy = op.operand().getType().cast<RankedTensorType>();
-     auto srcLayout = srcTy.getEncoding();
-     auto srcShape = srcTy.getShape();
--    auto srcRank = srcTy.getRank();
-     auto order = getOrder(srcLayout);
- 
-     auto threadsPerWarp = triton::gpu::getThreadsPerWarp(srcLayout);
-@@ -351,7 +350,6 @@ struct ReduceOpConversion
- 
-     Value zero = i32_val(0);
-     Value laneZero = icmp_eq(laneIdAxis, zero);
--    Value warpZero = icmp_eq(warpIdAxis, zero);
- 
-     for (auto it : accs) {
-       const SmallVector<unsigned> &key = it.first;
-diff --git a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
-index 5b77150b1a..78cfa076bd 100644
---- a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
-+++ b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
-@@ -11,11 +11,11 @@ using ::mlir::LLVM::getStructFromElements;
- using ::mlir::triton::gpu::getElemsPerThread;
- using ::mlir::triton::gpu::SharedEncodingAttr;
- 
--struct ReturnOpConversion : public ConvertOpToLLVMPattern<::mlir::ReturnOp> {
--  using ConvertOpToLLVMPattern<ReturnOp>::ConvertOpToLLVMPattern;
-+struct ReturnOpConversion : public ConvertOpToLLVMPattern<func::ReturnOp> {
-+  using ConvertOpToLLVMPattern<func::ReturnOp>::ConvertOpToLLVMPattern;
- 
-   LogicalResult
--  matchAndRewrite(ReturnOp op, OpAdaptor adaptor,
-+  matchAndRewrite(func::ReturnOp op, OpAdaptor adaptor,
-                   ConversionPatternRewriter &rewriter) const override {
-     unsigned numArguments = op.getNumOperands();
- 
-@@ -476,7 +476,6 @@ struct ExtractSliceOpConversion
- 
-     auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType());
-     auto elemPtrTy = ptr_ty(llvmElemTy, 3);
--    auto resTy = op.getType().dyn_cast<RankedTensorType>();
-     smemObj = SharedMemoryObject(gep(elemPtrTy, smemObj.base, offset),
-                                  strideVals, offsetVals);
-     auto retVal = getStructFromSharedMemoryObject(loc, smemObj, rewriter);
-diff --git a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMBase.h b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMBase.h
-index bb10d5b24a..00e399f848 100644
---- a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMBase.h
-+++ b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMBase.h
-@@ -4,6 +4,7 @@
- // TODO: refactor so that it doesn't fail if Allocation.h
- // is included after utility.h (due to conflict in `store` macro
- // and <atomic>
-+#include "mlir/Dialect/Func/IR/FuncOps.h"
- #include "triton/Analysis/Allocation.h"
- 
- //
-@@ -39,15 +40,15 @@ void vprintf_array(Value thread, ArrayRef<Value> arr, std::string info,
- // TODO(Superjomn): remove the code when MLIR v15.0 is included.
- // All the rights are reserved by the LLVM community.
- 
--struct FuncOpConversionBase : public ConvertOpToLLVMPattern<FuncOp> {
-+struct FuncOpConversionBase : public ConvertOpToLLVMPattern<func::FuncOp> {
- private:
-   /// Only retain those attributes that are not constructed by
-   /// `LLVMFuncOp::build`. If `filterArgAttrs` is set, also filter out argument
-   /// attributes.
--  static void filterFuncAttributes(ArrayRef<NamedAttribute> attrs,
--                                   bool filterArgAttrs,
-+  static void filterFuncAttributes(func::FuncOp op, bool filterArgAttrs,
-                                    SmallVectorImpl<NamedAttribute> &result) {
--    for (const auto &attr : attrs) {
-+
-+    for (const auto &attr : op->getAttrs()) {
-       if (attr.getName() == SymbolTable::getSymbolAttrName() ||
-           attr.getName() == FunctionOpInterface::getTypeAttrName() ||
-           attr.getName() == "std.varargs" ||
-@@ -65,27 +66,27 @@ struct FuncOpConversionBase : public ConvertOpToLLVMPattern<FuncOp> {
-   }
- 
- protected:
--  using ConvertOpToLLVMPattern<FuncOp>::ConvertOpToLLVMPattern;
-+  using ConvertOpToLLVMPattern<func::FuncOp>::ConvertOpToLLVMPattern;
- 
-   // Convert input FuncOp to LLVMFuncOp by using the LLVMTypeConverter provided
-   // to this legalization pattern.
-   LLVM::LLVMFuncOp
--  convertFuncOpToLLVMFuncOp(FuncOp funcOp,
-+  convertFuncOpToLLVMFuncOp(func::FuncOp funcOp,
-                             ConversionPatternRewriter &rewriter) const {
-     // Convert the original function arguments. They are converted using the
-     // LLVMTypeConverter provided to this legalization pattern.
-     auto varargsAttr = funcOp->getAttrOfType<BoolAttr>("func.varargs");
-     TypeConverter::SignatureConversion result(funcOp.getNumArguments());
-     auto llvmType = getTypeConverter()->convertFunctionSignature(
--        funcOp.getType(), varargsAttr && varargsAttr.getValue(), result);
-+        funcOp.getFunctionType(), varargsAttr && varargsAttr.getValue(),
-+        result);
-     if (!llvmType)
-       return nullptr;
- 
-     // Propagate argument/result attributes to all converted arguments/result
-     // obtained after converting a given original argument/result.
-     SmallVector<NamedAttribute, 4> attributes;
--    filterFuncAttributes(funcOp->getAttrs(), /*filterArgAttrs=*/true,
--                         attributes);
-+    filterFuncAttributes(funcOp, /*filterArgAttrs=*/true, attributes);
-     if (ArrayAttr resAttrDicts = funcOp.getAllResultAttrs()) {
-       assert(!resAttrDicts.empty() && "expected array to be non-empty");
-       auto newResAttrDicts =
-@@ -131,7 +132,7 @@ struct FuncOpConversionBase : public ConvertOpToLLVMPattern<FuncOp> {
-     }
-     auto newFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
-         funcOp.getLoc(), funcOp.getName(), llvmType, linkage,
--        /*dsoLocal*/ false, attributes);
-+        /*dsoLocal*/ false, LLVM::CConv::C, attributes);
-     rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(),
-                                 newFuncOp.end());
-     if (failed(rewriter.convertRegionTypes(&newFuncOp.getBody(), *typeConverter,
-@@ -191,8 +192,8 @@ class ConvertTritonGPUOpToLLVMPatternBase {
-                                                const Allocation *allocation,
-                                                Value smem,
-                                                IndexCacheInfo indexCacheInfo)
--      : converter(&typeConverter), indexCacheInfo(indexCacheInfo),
--        allocation(allocation), smem(smem) {}
-+      : converter(&typeConverter), allocation(allocation), smem(smem),
-+        indexCacheInfo(indexCacheInfo) {}
- 
-   LLVMTypeConverter *getTypeConverter() const { return converter; }
- 
-@@ -861,7 +862,6 @@ class ConvertTritonGPUOpToLLVMPatternBase {
-                             ArrayRef<int64_t> shape) const {
-     auto parent = sliceLayout.getParent();
-     unsigned dim = sliceLayout.getDim();
--    size_t rank = shape.size();
-     auto parentIndices =
-         emitIndices(loc, rewriter, parent, sliceLayout.paddedShape(shape));
-     unsigned numIndices = parentIndices.size();
-diff --git a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp
-index ff1af09835..6f66af4e34 100644
---- a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp
-+++ b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.cpp
-@@ -1,10 +1,11 @@
- #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h"
- 
-+#include "mlir/Analysis/DataFlowFramework.h"
- #include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h"
-+#include "mlir/Conversion/ControlFlowToLLVM//ControlFlowToLLVM.h"
- #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
- #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
--#include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
--#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
-+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
- #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
- #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
- #include "mlir/Pass/Pass.h"
-@@ -40,7 +41,6 @@ class TritonLLVMConversionTarget : public ConversionTarget {
-     addIllegalDialect<triton::TritonDialect>();
-     addIllegalDialect<triton::gpu::TritonGPUDialect>();
-     addIllegalDialect<mlir::gpu::GPUDialect>();
--    addIllegalDialect<mlir::StandardOpsDialect>();
-     addLegalOp<mlir::UnrealizedConversionCastOp>();
-   }
- };
-@@ -51,7 +51,7 @@ class TritonLLVMFunctionConversionTarget : public ConversionTarget {
-       : ConversionTarget(ctx) {
-     addLegalDialect<LLVM::LLVMDialect>();
-     addLegalDialect<NVVM::NVVMDialect>();
--    addIllegalOp<mlir::FuncOp>();
-+    addIllegalOp<mlir::func::FuncOp>();
-     addLegalOp<mlir::UnrealizedConversionCastOp>();
-   }
- };
-@@ -69,7 +69,7 @@ struct FuncOpConversion : public FuncOpConversionBase {
-       : FuncOpConversionBase(converter, benefit), numWarps(numWarps) {}
- 
-   LogicalResult
--  matchAndRewrite(FuncOp funcOp, OpAdaptor adaptor,
-+  matchAndRewrite(func::FuncOp funcOp, OpAdaptor adaptor,
-                   ConversionPatternRewriter &rewriter) const override {
-     auto newFuncOp = convertFuncOpToLLVMFuncOp(funcOp, rewriter);
-     if (!newFuncOp)
-@@ -133,7 +133,8 @@ class ConvertTritonGPUToLLVM
-     decomposeBlockedToDotOperand(mod);
- 
-     // Step 2
--    decomposeInsertSliceAsyncOp(mod);
-+    if (failed(decomposeInsertSliceAsyncOp(mod)))
-+      return signalPassFailure();
- 
-     // Step 3
-     Allocation allocation(mod);
-@@ -142,7 +143,7 @@ class ConvertTritonGPUToLLVM
- 
-     // Step 4
-     RewritePatternSet scf_patterns(context);
--    mlir::populateLoopToStdConversionPatterns(scf_patterns);
-+    mlir::populateSCFToControlFlowConversionPatterns(scf_patterns);
-     mlir::ConversionTarget scf_target(*context);
-     scf_target.addIllegalOp<scf::ForOp, scf::IfOp, scf::ParallelOp,
-                             scf::WhileOp, scf::ExecuteRegionOp>();
-@@ -159,8 +160,10 @@ class ConvertTritonGPUToLLVM
-       return signalPassFailure();
- 
-     // Step 6 - get axis and shared memory info
--    AxisInfoAnalysis axisInfoAnalysis(mod.getContext());
--    axisInfoAnalysis.run(mod);
-+    std::unique_ptr<DataFlowSolver> solver = createDataFlowSolver();
-+    AxisInfoAnalysis *axisInfoAnalysis = solver->load<AxisInfoAnalysis>();
-+    if (failed(solver->initializeAndRun(mod)))
-+      return signalPassFailure();
-     initSharedMemory(allocation.getSharedMemorySize(), typeConverter);
-     mod->setAttr("triton_gpu.shared",
-                  mlir::IntegerAttr::get(mlir::IntegerType::get(context, 32),
-@@ -178,38 +181,39 @@ class ConvertTritonGPUToLLVM
- 
-     // Normal conversions
-     populateTritonGPUToLLVMPatterns(typeConverter, patterns, numWarps,
--                                    axisInfoAnalysis, &allocation, smem,
-+                                    *axisInfoAnalysis, &allocation, smem,
-                                     indexCacheInfo, /*benefit=*/10);
-     // ConvertLayoutOp
-     populateConvertLayoutOpToLLVMPatterns(typeConverter, patterns, numWarps,
--                                          axisInfoAnalysis, &allocation, smem,
-+                                          *axisInfoAnalysis, &allocation, smem,
-                                           indexCacheInfo, /*benefit=*/10);
-     // DotOp
-     populateDotOpToLLVMPatterns(typeConverter, patterns, numWarps,
--                                axisInfoAnalysis, &allocation, smem,
-+                                *axisInfoAnalysis, &allocation, smem,
-                                 /*benefit=*/10);
-     // ElementwiseOp
-     populateElementwiseOpToLLVMPatterns(typeConverter, patterns, numWarps,
--                                        axisInfoAnalysis, &allocation, smem,
-+                                        *axisInfoAnalysis, &allocation, smem,
-                                         /*benefit=*/10);
-     // LoadStoreOp
-     populateLoadStoreOpToLLVMPatterns(typeConverter, patterns, numWarps,
--                                      axisInfoAnalysis, &allocation, smem,
-+                                      *axisInfoAnalysis, &allocation, smem,
-                                       indexCacheInfo, /*benefit=*/10);
-     // ReduceOp
-     populateReduceOpToLLVMPatterns(typeConverter, patterns, numWarps,
--                                   axisInfoAnalysis, &allocation, smem,
-+                                   *axisInfoAnalysis, &allocation, smem,
-                                    indexCacheInfo, /*benefit=*/10);
-     // ViewOp
-     populateViewOpToLLVMPatterns(typeConverter, patterns, numWarps,
--                                 axisInfoAnalysis, &allocation, smem,
-+                                 *axisInfoAnalysis, &allocation, smem,
-                                  /*benefit=*/10);
- 
-     // Add arith/math's patterns to help convert scalar expression to LLVM.
-     mlir::arith::populateArithmeticToLLVMConversionPatterns(typeConverter,
-                                                             patterns);
-     mlir::populateMathToLLVMConversionPatterns(typeConverter, patterns);
--    mlir::populateStdToLLVMConversionPatterns(typeConverter, patterns);
-+    mlir::cf::populateControlFlowToLLVMConversionPatterns(typeConverter,
-+                                                          patterns);
-     mlir::populateGpuToNVVMConversionPatterns(typeConverter, patterns);
- 
-     if (failed(applyPartialConversion(mod, target, std::move(patterns))))
-@@ -306,9 +310,11 @@ class ConvertTritonGPUToLLVM
-     });
-   }
- 
--  void decomposeInsertSliceAsyncOp(ModuleOp mod) const {
--    AxisInfoAnalysis axisInfoAnalysis(mod.getContext());
--    axisInfoAnalysis.run(mod);
-+  LogicalResult decomposeInsertSliceAsyncOp(ModuleOp mod) const {
-+    std::unique_ptr<DataFlowSolver> solver = createDataFlowSolver();
-+    AxisInfoAnalysis *axisInfoAnalysis = solver->load<AxisInfoAnalysis>();
-+    if (failed(solver->initializeAndRun(mod)))
-+      return failure();
-     // TODO(Keren): This is a hacky knob that may cause performance regression
-     // when decomposition has been performed. We should remove this knob once we
-     // have thorough analysis on async wait. Currently, we decompose
-@@ -342,7 +348,7 @@ class ConvertTritonGPUToLLVM
-       auto resSharedLayout =
-           dstTy.getEncoding().dyn_cast<triton::gpu::SharedEncodingAttr>();
-       auto resElemTy = dstTy.getElementType();
--      unsigned inVec = axisInfoAnalysis.getPtrContiguity(src);
-+      unsigned inVec = axisInfoAnalysis->getPtrContiguity(src);
-       unsigned outVec = resSharedLayout.getVec();
-       unsigned minVec = std::min(outVec, inVec);
-       auto maxBitWidth =
-@@ -400,11 +406,11 @@ class ConvertTritonGPUToLLVM
-       } else if (decomposed) {
-         // Wait for all previous async ops
-         OpBuilder builder(asyncWaitOp);
--        auto newAsyncWaitOp =
--            builder.create<triton::gpu::AsyncWaitOp>(asyncWaitOp.getLoc(), 0);
-+        builder.create<triton::gpu::AsyncWaitOp>(asyncWaitOp.getLoc(), 0);
-         asyncWaitOp.erase();
-       }
-     });
-+    return success();
-   }
- };
- 
-diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.h b/lib/Conversion/TritonGPUToLLVM/Utility.h
-index d35dac28c5..11976908cf 100644
---- a/lib/Conversion/TritonGPUToLLVM/Utility.h
-+++ b/lib/Conversion/TritonGPUToLLVM/Utility.h
-@@ -220,10 +220,7 @@ struct SharedMemoryObject {
-                      ConversionPatternRewriter &rewriter)
-       : base(base) {
-     strides = getStridesFromShapeAndOrder(shape, order, loc, rewriter);
--
--    for (auto idx : order) {
--      offsets.emplace_back(i32_val(0));
--    }
-+    offsets.append(order.size(), i32_val(0));
-   }
- 
-   SmallVector<Value> getElems() const {
-diff --git a/lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp b/lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp
-index fe42202c34..5f230f787f 100644
---- a/lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp
-+++ b/lib/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.cpp
-@@ -1,10 +1,10 @@
- #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h"
- 
- #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
--#include "mlir/Dialect/GPU/GPUDialect.h"
-+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
- #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
- #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
--#include "mlir/Dialect/StandardOps/IR/Ops.h"
- #include "mlir/Pass/Pass.h"
- #include "mlir/Transforms/DialectConversion.h"
- #include "triton/Dialect/Triton/IR/Dialect.h"
-@@ -59,10 +59,13 @@ class ArithConstantPattern : public OpConversionPattern<arith::ConstantOp> {
-     Type retType = getTypeConverter()->convertType(op.getType());
-     auto value = adaptor.getValue().dyn_cast<DenseElementsAttr>();
-     assert(value);
--    rewriter.replaceOpWithNewOp<arith::ConstantOp>(
--        op, retType,
--        value.reshape(retType) // This is a hack. We just want to add encoding
--    );
-+    if (value.getElementType().isInteger(1) && value.isSplat())
-+      // Workaround until https://reviews.llvm.org/D133743 is included.
-+      value = DenseElementsAttr::get(retType, value.getSplatValue<bool>());
-+    else
-+      // This is a hack. We just want to add encoding
-+      value = value.reshape(retType);
-+    rewriter.replaceOpWithNewOp<arith::ConstantOp>(op, retType, value);
-     return success();
-   }
- };
-@@ -127,12 +130,12 @@ void populateArithmeticPatternsAndLegality(
- }
- 
- // this shouldn't exist if mlir's SelectOp checked encodings properly
--class StdSelectPattern : public OpConversionPattern<SelectOp> {
-+class StdSelectPattern : public OpConversionPattern<arith::SelectOp> {
- public:
--  using OpConversionPattern<SelectOp>::OpConversionPattern;
-+  using OpConversionPattern<arith::SelectOp>::OpConversionPattern;
- 
-   LogicalResult
--  matchAndRewrite(SelectOp op, typename SelectOp::Adaptor adaptor,
-+  matchAndRewrite(arith::SelectOp op, OpAdaptor adaptor,
-                   ConversionPatternRewriter &rewriter) const override {
-     Type retType = this->getTypeConverter()->convertType(op.getType());
-     rewriter.replaceOpWithNewOp<triton::gpu::SelectOp>(
-@@ -148,8 +151,8 @@ void populateStdPatternsAndLegality(TritonGPUTypeConverter &typeConverter,
-   MLIRContext *context = patterns.getContext();
-   // Rewrite rule
-   patterns.add<StdSelectPattern>(typeConverter, context);
--  target.addLegalOp<ReturnOp>(); // this is ok because all functions are inlined
--                                 // by the frontend
-+  target.addLegalOp<func::ReturnOp>(); // this is ok because all functions are
-+                                       // inlined by the frontend
- }
- 
- void populateMathPatternsAndLegality(TritonGPUTypeConverter &typeConverter,
-@@ -455,18 +458,19 @@ struct TritonPrintfPattern : public OpConversionPattern<triton::PrintfOp> {
- void populateTritonPatterns(TritonGPUTypeConverter &typeConverter,
-                             RewritePatternSet &patterns) {
-   MLIRContext *context = patterns.getContext();
--  patterns.add< // TODO: view should have custom pattern that views the layout
--      TritonGenericPattern<triton::ViewOp>,
--      TritonGenericPattern<triton::BitcastOp>,
--      TritonGenericPattern<triton::FpToFpOp>,
--      TritonGenericPattern<triton::IntToPtrOp>,
--      TritonGenericPattern<triton::PtrToIntOp>,
--      TritonGenericPattern<triton::SplatOp>, TritonBroadcastPattern,
--      TritonGenericPattern<triton::AddPtrOp>, TritonCatPattern,
--      TritonReducePattern, TritonTransPattern, TritonExpandDimsPattern,
--      TritonMakeRangePattern, TritonDotPattern, TritonLoadPattern,
--      TritonStorePattern, TritonExtElemwisePattern, TritonPrintfPattern,
--      TritonAtomicRMWPattern>(typeConverter, context);
-+  patterns
-+      .insert< // TODO: view should have custom pattern that views the layout
-+          TritonGenericPattern<triton::ViewOp>,
-+          TritonGenericPattern<triton::BitcastOp>,
-+          TritonGenericPattern<triton::FpToFpOp>,
-+          TritonGenericPattern<triton::IntToPtrOp>,
-+          TritonGenericPattern<triton::PtrToIntOp>,
-+          TritonGenericPattern<triton::SplatOp>, TritonBroadcastPattern,
-+          TritonGenericPattern<triton::AddPtrOp>, TritonCatPattern,
-+          TritonReducePattern, TritonTransPattern, TritonExpandDimsPattern,
-+          TritonMakeRangePattern, TritonDotPattern, TritonLoadPattern,
-+          TritonStorePattern, TritonExtElemwisePattern, TritonPrintfPattern,
-+          TritonAtomicRMWPattern>(typeConverter, context);
- }
- 
- //
-@@ -623,29 +627,28 @@ void populateSCFPatterns(TritonGPUTypeConverter &typeConverter,
- 
- // CF
- 
--class CFBranchPattern : public OpConversionPattern<BranchOp> {
-+class CFBranchPattern : public OpConversionPattern<cf::BranchOp> {
- public:
--  using OpConversionPattern<BranchOp>::OpConversionPattern;
-+  using OpConversionPattern<cf::BranchOp>::OpConversionPattern;
- 
-   LogicalResult
--  matchAndRewrite(BranchOp op, BranchOp::Adaptor adaptor,
-+  matchAndRewrite(cf::BranchOp op, cf::BranchOp::Adaptor adaptor,
-                   ConversionPatternRewriter &rewriter) const override {
--    auto converter = getTypeConverter();
--    auto newOp = rewriter.replaceOpWithNewOp<BranchOp>(op, op.getSuccessor(),
--                                                       adaptor.getOperands());
-+    auto newOp = rewriter.replaceOpWithNewOp<cf::BranchOp>(
-+        op, op.getSuccessor(), adaptor.getOperands());
-     return success();
-   }
- };
- 
--class CFCondBranchPattern : public OpConversionPattern<CondBranchOp> {
-+class CFCondBranchPattern : public OpConversionPattern<cf::CondBranchOp> {
- public:
--  using OpConversionPattern<CondBranchOp>::OpConversionPattern;
-+  using OpConversionPattern<cf::CondBranchOp>::OpConversionPattern;
- 
-   LogicalResult
--  matchAndRewrite(CondBranchOp op, CondBranchOp::Adaptor adaptor,
-+  matchAndRewrite(cf::CondBranchOp op, cf::CondBranchOp::Adaptor adaptor,
-                   ConversionPatternRewriter &rewriter) const override {
-     auto converter = getTypeConverter();
--    auto newOp = rewriter.replaceOpWithNewOp<CondBranchOp>(
-+    auto newOp = rewriter.replaceOpWithNewOp<cf::CondBranchOp>(
-         op, adaptor.getCondition(), op.getTrueDest(),
-         adaptor.getTrueDestOperands(), op.getFalseDest(),
-         adaptor.getFalseDestOperands());
-diff --git a/lib/Dialect/Triton/IR/CMakeLists.txt b/lib/Dialect/Triton/IR/CMakeLists.txt
-index 2d679b21fd..705554ba6b 100644
---- a/lib/Dialect/Triton/IR/CMakeLists.txt
-+++ b/lib/Dialect/Triton/IR/CMakeLists.txt
-@@ -10,11 +10,7 @@ add_mlir_dialect_library(TritonIR
- 
-   LINK_LIBS PUBLIC
-   MLIRIR
--  MLIRArithmetic
--  MLIRSCF
--
--  # Since LLVM 15
--  # MLIRFunc
--  # else
--  MLIRStandard
-+  MLIRArithmeticDialect
-+  MLIRSCFDialect
-+  MLIRFuncDialect
- )
-diff --git a/lib/Dialect/Triton/IR/Ops.cpp b/lib/Dialect/Triton/IR/Ops.cpp
-index 3aadbfa0c0..86570359c5 100644
---- a/lib/Dialect/Triton/IR/Ops.cpp
-+++ b/lib/Dialect/Triton/IR/Ops.cpp
-@@ -1,10 +1,9 @@
--#include "triton/Dialect/Triton/IR/Dialect.h"
--#include "triton/Dialect/Triton/IR/Types.h"
--
- #include "mlir/IR/Builders.h"
- #include "mlir/IR/BuiltinAttributes.h"
- #include "mlir/IR/BuiltinTypes.h"
- #include "mlir/IR/OperationSupport.h"
-+#include "triton/Dialect/Triton/IR/Dialect.h"
-+#include "triton/Dialect/Triton/IR/Types.h"
- 
- namespace mlir {
- namespace triton {
-@@ -38,8 +37,8 @@ static Type getPointerTypeSameShape(Type type) {
- }
- 
- // Parser & printer for assembly forms
--ParseResult parseLoadOp(OpAsmParser &parser, OperationState &result) {
--  SmallVector<OpAsmParser::OperandType, 4> allOperands;
-+ParseResult LoadOp::parse(OpAsmParser &parser, OperationState &result) {
-+  SmallVector<OpAsmParser::UnresolvedOperand, 4> allOperands;
-   Type resultTypes[1];
-   SMLoc allOperandLoc = parser.getCurrentLocation();
-   if (parser.parseOperandList(allOperands) ||
-@@ -73,18 +72,18 @@ ParseResult parseLoadOp(OpAsmParser &parser, OperationState &result) {
-   return success();
- }
- 
--void printLoadOp(OpAsmPrinter &printer, LoadOp loadOp) {
-+void LoadOp::print(OpAsmPrinter &printer) {
-   printer << " ";
--  printer << loadOp.getOperation()->getOperands();
-+  printer << getOperation()->getOperands();
-   // "operand_segment_sizes" can be deduced, so we don't print it.
--  printer.printOptionalAttrDict(loadOp->getAttrs(),
--                                {loadOp.operand_segment_sizesAttrName()});
-+  printer.printOptionalAttrDict(getOperation()->getAttrs(),
-+                                {operand_segment_sizesAttrName()});
-   printer << " : ";
--  printer.printStrippedAttrOrType(loadOp.result().getType());
-+  printer.printStrippedAttrOrType(getResult().getType());
- }
- 
--ParseResult parseStoreOp(OpAsmParser &parser, OperationState &result) {
--  SmallVector<OpAsmParser::OperandType, 4> allOperands;
-+ParseResult StoreOp::parse(OpAsmParser &parser, OperationState &result) {
-+  SmallVector<OpAsmParser::UnresolvedOperand, 4> allOperands;
-   Type valueType;
-   SMLoc allOperandLoc = parser.getCurrentLocation();
-   if (parser.parseOperandList(allOperands) ||
-@@ -104,12 +103,12 @@ ParseResult parseStoreOp(OpAsmParser &parser, OperationState &result) {
-   return success();
- }
- 
--void printStoreOp(OpAsmPrinter &printer, StoreOp storeOp) {
-+void StoreOp::print(OpAsmPrinter &printer) {
-   printer << " ";
--  printer << storeOp.getOperation()->getOperands();
--  printer.printOptionalAttrDict(storeOp->getAttrs(), /*elidedAttrs=*/{});
-+  printer << getOperation()->getOperands();
-+  printer.printOptionalAttrDict(getOperation()->getAttrs(), /*elidedAttrs=*/{});
-   printer << " : ";
--  printer.printStrippedAttrOrType(storeOp.value().getType());
-+  printer.printStrippedAttrOrType(value().getType());
- }
- 
- } // namespace triton
-@@ -319,7 +318,8 @@ OpFoldResult SplatOp::fold(ArrayRef<Attribute> operands) {
-   if (!constOperand)
-     return {};
-   auto shapedType = getType().cast<ShapedType>();
--  auto ret = SplatElementsAttr::get(shapedType, {constOperand.getValue()});
-+  auto ret = SplatElementsAttr::get(
-+      shapedType, ArrayRef<Attribute>(constOperand.getValue()));
-   return ret;
- }
- 
-diff --git a/lib/Dialect/Triton/Transforms/Combine.cpp b/lib/Dialect/Triton/Transforms/Combine.cpp
-index 2261472170..11570283d6 100644
---- a/lib/Dialect/Triton/Transforms/Combine.cpp
-+++ b/lib/Dialect/Triton/Transforms/Combine.cpp
-@@ -57,13 +57,13 @@ DenseElementsAttr getConstantValue(Builder &builder, Attribute value,
- class CombineSelectMaskedLoadPattern : public mlir::RewritePattern {
- public:
-   CombineSelectMaskedLoadPattern(mlir::MLIRContext *context)
--      : mlir::RewritePattern(mlir::SelectOp::getOperationName(), 3, context,
--                             {triton::LoadOp::getOperationName()}) {}
-+      : mlir::RewritePattern(mlir::arith::SelectOp::getOperationName(), 3,
-+                             context, {triton::LoadOp::getOperationName()}) {}
- 
-   mlir::LogicalResult
-   matchAndRewrite(mlir::Operation *op,
-                   mlir::PatternRewriter &rewriter) const override {
--    auto selectOp = llvm::dyn_cast<mlir::SelectOp>(op);
-+    auto selectOp = llvm::dyn_cast<mlir::arith::SelectOp>(op);
-     if (!selectOp)
-       return mlir::failure();
- 
-diff --git a/lib/Dialect/Triton/Transforms/Combine.td b/lib/Dialect/Triton/Transforms/Combine.td
-index 14f286b26e..ded0e346e6 100644
---- a/lib/Dialect/Triton/Transforms/Combine.td
-+++ b/lib/Dialect/Triton/Transforms/Combine.td
-@@ -1,9 +1,9 @@
- #ifndef TRITON_PATTERNS
- #define TRITON_PATTERNS
- 
--include "mlir/Dialect/StandardOps/IR/Ops.td"
- include "mlir/Dialect/Arithmetic/IR/ArithmeticOps.td"
- include "triton/Dialect/Triton/IR/TritonOps.td"
-+include "mlir/IR/PatternBase.td"
- 
- 
- // AddIOp(DotOp(a, b, c), d) and c==0 => DotOp(a, b, d)
-diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
-index 1fbc609e88..bfc3f3d3da 100644
---- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
-+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
-@@ -1,14 +1,14 @@
-+#include "triton/Dialect/Triton/IR/Dialect.h"
-+
- #include <numeric>
- 
- #include "mlir/IR/DialectImplementation.h"
- #include "mlir/IR/OpImplementation.h"
- #include "triton/Analysis/Utility.h"
--#include "triton/Dialect/Triton/IR/Dialect.h"
-+#include "triton/Dialect/TritonGPU/IR/Dialect.cpp.inc"
- #include "triton/Dialect/TritonGPU/IR/Dialect.h"
- #include "llvm/ADT/TypeSwitch.h"
- 
--#include "triton/Dialect/TritonGPU/IR/Dialect.cpp.inc"
--
- using namespace mlir;
- using namespace mlir::triton::gpu;
- 
-@@ -366,7 +366,6 @@ template SmallVector<int64_t>
- SliceEncodingAttr::paddedShape<int64_t>(ArrayRef<int64_t> shape) const;
- 
- unsigned SliceEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape) const {
--  size_t rank = shape.size();
-   auto parent = getParent();
-   return ::getElemsPerThread(parent, paddedShape(shape));
- }
-@@ -655,9 +654,9 @@ void DotOperandEncodingAttr::print(mlir::AsmPrinter &printer) const {
- // InsertSliceAsyncOp
- //===----------------------------------------------------------------------===//
- 
--ParseResult parseInsertSliceAsyncOp(OpAsmParser &parser,
--                                    OperationState &result) {
--  SmallVector<OpAsmParser::OperandType, 8> allOperands;
-+ParseResult InsertSliceAsyncOp::parse(OpAsmParser &parser,
-+                                      OperationState &result) {
-+  SmallVector<OpAsmParser::UnresolvedOperand, 8> allOperands;
-   Type srcType, dstType;
-   SMLoc allOperandLoc = parser.getCurrentLocation();
-   if (parser.parseOperandList(allOperands) ||
-@@ -696,18 +695,16 @@ ParseResult parseInsertSliceAsyncOp(OpAsmParser &parser,
-   return success();
- }
- 
--void printInsertSliceAsyncOp(OpAsmPrinter &printer,
--                             InsertSliceAsyncOp insertSliceAsyncOp) {
-+void InsertSliceAsyncOp::print(OpAsmPrinter &printer) {
-   printer << " ";
--  printer << insertSliceAsyncOp.getOperation()->getOperands();
-+  printer << getOperation()->getOperands();
-   // "operand_segment_sizes" can be deduced, so we don't print it.
--  printer.printOptionalAttrDict(
--      insertSliceAsyncOp->getAttrs(),
--      {insertSliceAsyncOp.operand_segment_sizesAttrName()});
-+  printer.printOptionalAttrDict(getOperation()->getAttrs(),
-+                                {operand_segment_sizesAttrName()});
-   printer << " : ";
--  printer.printStrippedAttrOrType(insertSliceAsyncOp.src().getType());
-+  printer.printStrippedAttrOrType(src().getType());
-   printer << " -> ";
--  printer.printStrippedAttrOrType(insertSliceAsyncOp.result().getType());
-+  printer.printStrippedAttrOrType(result().getType());
- }
- 
- //===----------------------------------------------------------------------===//
-diff --git a/lib/Dialect/TritonGPU/Transforms/Coalesce.cpp b/lib/Dialect/TritonGPU/Transforms/Coalesce.cpp
-index 82407980d3..ee6009f44a 100644
---- a/lib/Dialect/TritonGPU/Transforms/Coalesce.cpp
-+++ b/lib/Dialect/TritonGPU/Transforms/Coalesce.cpp
-@@ -27,7 +27,11 @@ struct CoalescePass : public TritonGPUCoalesceBase<CoalescePass> {
-     auto origType = ptr.getType().cast<RankedTensorType>();
-     // Get the shape of the tensor.
-     size_t rank = origType.getRank();
--    AxisInfo info = axisInfo.lookupLatticeElement(ptr)->getValue();
-+    dataflow::Lattice<AxisInfo> *latticeElement =
-+        axisInfo.getLatticeElement(ptr);
-+    AxisInfo info = latticeElement && !latticeElement->isUninitialized()
-+                        ? latticeElement->getValue()
-+                        : AxisInfo();
-     // Get the contiguity order of `ptr`
-     auto order = argSort(info.getContiguity());
-     // The desired divisibility is the maximum divisibility
-@@ -40,7 +44,7 @@ struct CoalescePass : public TritonGPUCoalesceBase<CoalescePass> {
-         for (Value val : op->getResults()) {
-           if (val.getType() != origType)
-             continue;
--          auto valInfo = axisInfo.lookupLatticeElement(val);
-+          auto valInfo = axisInfo.getLatticeElement(val);
-           auto currOrder = argSort(valInfo->getValue().getContiguity());
-           if (order == currOrder)
-             withSameOrder.insert(val);
-@@ -55,7 +59,7 @@ struct CoalescePass : public TritonGPUCoalesceBase<CoalescePass> {
-     unsigned elemNumBytes = std::max(elemNumBits / 8, 1u);
-     unsigned perThread = 1;
-     for (Value val : withSameOrder) {
--      AxisInfo info = axisInfo.lookupLatticeElement(val)->getValue();
-+      AxisInfo info = axisInfo.getLatticeElement(val)->getValue();
-       unsigned maxMultipleBytes = info.getDivisibility(order[0]);
-       unsigned maxMultiple = std::max(maxMultipleBytes / elemNumBytes, 1u);
-       unsigned maxContig = info.getContiguity(order[0]);
-@@ -123,8 +127,10 @@ struct CoalescePass : public TritonGPUCoalesceBase<CoalescePass> {
-   void runOnOperation() override {
-     Operation *op = getOperation();
-     // Run axis info analysis
--    AxisInfoAnalysis axisInfo(&getContext());
--    axisInfo.run(op);
-+    std::unique_ptr<DataFlowSolver> solver = createDataFlowSolver();
-+    AxisInfoAnalysis *axisInfo = solver->load<AxisInfoAnalysis>();
-+    if (failed(solver->initializeAndRun(op)))
-+      return signalPassFailure();
- 
-     // For each i/o operation, we determine what layout
-     // the pointers should have for best memory coalescing
-@@ -146,10 +152,10 @@ struct CoalescePass : public TritonGPUCoalesceBase<CoalescePass> {
-       RankedTensorType ty = ptr.getType().template dyn_cast<RankedTensorType>();
-       if (!ty || !ty.getElementType().isa<PointerType>())
-         return;
--      AxisInfo info = axisInfo.lookupLatticeElement(ptr)->getValue();
-+      AxisInfo info = axisInfo->getLatticeElement(ptr)->getValue();
-       auto mod = curr->getParentOfType<ModuleOp>();
-       int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod);
--      auto convertType = getTypeConverter(axisInfo, ptr, numWarps);
-+      auto convertType = getTypeConverter(*axisInfo, ptr, numWarps);
-       layoutMap[ptr] = convertType;
-     });
- 
-diff --git a/lib/Dialect/TritonGPU/Transforms/Combine.cpp b/lib/Dialect/TritonGPU/Transforms/Combine.cpp
-index efa37ff2dc..089ce3996c 100644
---- a/lib/Dialect/TritonGPU/Transforms/Combine.cpp
-+++ b/lib/Dialect/TritonGPU/Transforms/Combine.cpp
-@@ -1,6 +1,6 @@
- #include "Utility.h"
- #include "mlir/Analysis/SliceAnalysis.h"
--#include "mlir/Dialect/SCF/SCF.h"
-+#include "mlir/Dialect/SCF/IR/SCF.h"
- #include "mlir/IR/BlockAndValueMapping.h"
- #include "mlir/IR/BuiltinAttributes.h"
- #include "mlir/IR/Matchers.h"
-diff --git a/lib/Dialect/TritonGPU/Transforms/Combine.td b/lib/Dialect/TritonGPU/Transforms/Combine.td
-index 6bf1b14866..6a7b10dbcb 100644
---- a/lib/Dialect/TritonGPU/Transforms/Combine.td
-+++ b/lib/Dialect/TritonGPU/Transforms/Combine.td
-@@ -3,5 +3,6 @@
- 
- include "triton/Dialect/TritonGPU/IR/TritonGPUOps.td"
- include "triton/Dialect/Triton/IR/TritonOps.td"
-+include "mlir/IR/PatternBase.td"
- 
- #endif
-diff --git a/lib/Dialect/TritonGPU/Transforms/DecomposeConversions.cpp b/lib/Dialect/TritonGPU/Transforms/DecomposeConversions.cpp
-index 4bd3bc76bf..b2f8defd81 100644
---- a/lib/Dialect/TritonGPU/Transforms/DecomposeConversions.cpp
-+++ b/lib/Dialect/TritonGPU/Transforms/DecomposeConversions.cpp
-@@ -1,5 +1,5 @@
- #include "mlir/Analysis/SliceAnalysis.h"
--#include "mlir/Dialect/SCF/SCF.h"
-+#include "mlir/Dialect/SCF/IR/SCF.h"
- #include "mlir/IR/BlockAndValueMapping.h"
- #include "mlir/IR/BuiltinAttributes.h"
- #include "mlir/IR/Matchers.h"
-diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp
-index 9b2f42231e..85f746c1dc 100644
---- a/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp
-+++ b/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp
-@@ -2,6 +2,7 @@
- #include "mlir/IR/BlockAndValueMapping.h"
- #include "mlir/IR/TypeUtilities.h"
- #include "triton/Analysis/AxisInfo.h"
-+#include "triton/Analysis/Utility.h"
- #include "triton/Dialect/TritonGPU/IR/Dialect.h"
- #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
- 
-@@ -160,15 +161,18 @@ ttg::AllocTensorOp LoopPipeliner::allocateEmptyBuffer(Operation *op,
- LogicalResult LoopPipeliner::initialize() {
-   Block *loop = forOp.getBody();
- 
--  AxisInfoAnalysis axisInfoAnalysis(forOp.getContext());
--  axisInfoAnalysis.run(forOp->getParentOfType<ModuleOp>());
-+  std::unique_ptr<DataFlowSolver> solver = createDataFlowSolver();
-+  AxisInfoAnalysis *axisInfoAnalysis = solver->load<AxisInfoAnalysis>();
-+  if (failed(solver->initializeAndRun(forOp->getParentOfType<ModuleOp>()))) {
-+    return failure();
-+  }
- 
-   // can we use forOp.walk(...) here?
-   SmallVector<triton::LoadOp, 2> allLoads;
-   for (Operation &op : *loop)
-     if (auto loadOp = dyn_cast<triton::LoadOp>(&op)) {
-       auto ptr = loadOp.ptr();
--      unsigned vec = axisInfoAnalysis.getPtrContiguity(ptr);
-+      unsigned vec = axisInfoAnalysis->getPtrContiguity(ptr);
-       auto tensorTy = ptr.getType().dyn_cast<RankedTensorType>();
-       if (!tensorTy)
-         continue;
-diff --git a/lib/Dialect/TritonGPU/Transforms/ReorderInstructions.cpp b/lib/Dialect/TritonGPU/Transforms/ReorderInstructions.cpp
-index 0e7dbe5264..b95a4f50a6 100644
---- a/lib/Dialect/TritonGPU/Transforms/ReorderInstructions.cpp
-+++ b/lib/Dialect/TritonGPU/Transforms/ReorderInstructions.cpp
-@@ -1,5 +1,5 @@
- #include "mlir/Analysis/SliceAnalysis.h"
--#include "mlir/Dialect/SCF/SCF.h"
-+#include "mlir/Dialect/SCF/IR/SCF.h"
- #include "mlir/IR/BlockAndValueMapping.h"
- #include "mlir/IR/BuiltinAttributes.h"
- #include "mlir/IR/Matchers.h"
-diff --git a/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp b/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp
-index 37ac710995..762e887f36 100644
---- a/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp
-+++ b/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp
-@@ -82,12 +82,12 @@ TritonGPUConversionTarget::TritonGPUConversionTarget(
-                scf::ReduceReturnOp>();
- 
-   addDynamicallyLegalDialect<arith::ArithmeticDialect, math::MathDialect,
--                             triton::TritonDialect, StandardOpsDialect,
--                             scf::SCFDialect>([&](Operation *op) {
--    if (typeConverter.isLegal(op))
--      return true;
--    return false;
--  });
-+                             triton::TritonDialect, scf::SCFDialect>(
-+      [&](Operation *op) {
-+        if (typeConverter.isLegal(op))
-+          return true;
-+        return false;
-+      });
- 
-   // We have requirements for the data layouts
-   addDynamicallyLegalOp<triton::DotOp>([](triton::DotOp dotOp) -> bool {
-diff --git a/lib/Dialect/TritonGPU/Transforms/UpdateMmaForVolta.cpp b/lib/Dialect/TritonGPU/Transforms/UpdateMmaForVolta.cpp
-index c229104286..c911fd4a5c 100644
---- a/lib/Dialect/TritonGPU/Transforms/UpdateMmaForVolta.cpp
-+++ b/lib/Dialect/TritonGPU/Transforms/UpdateMmaForVolta.cpp
-@@ -1,5 +1,5 @@
- #include "Utility.h"
--#include "mlir/Dialect/SCF/SCF.h"
-+#include "mlir/Dialect/SCF/IR/SCF.h"
- #include "mlir/IR/Matchers.h"
- #include "mlir/IR/PatternMatch.h"
- #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-@@ -118,8 +118,8 @@ void setOpResultType(Operation *op, ArrayRef<Type> newTypes) {
-                     .get("value")
-                     .dyn_cast<mlir::DenseElementsAttr>();
-     if (attr) {
--      auto newAttr = mlir::DenseElementsAttr::getFromRawBuffer(
--          newType, attr.getRawData(), true);
-+      auto newAttr =
-+          mlir::DenseElementsAttr::getFromRawBuffer(newType, attr.getRawData());
-       op->setAttr("value", newAttr);
-     }
-   }
-diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
-index ed15f02f67..6400f1633a 100644
---- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp
-+++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp
-@@ -1,5 +1,5 @@
- #include "Utility.h"
--#include "mlir/Dialect/SCF/SCF.h"
-+#include "mlir/Dialect/SCF/IR/SCF.h"
- #include "mlir/IR/BlockAndValueMapping.h"
- #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
- 
-diff --git a/lib/Target/LLVMIR/CMakeLists.txt b/lib/Target/LLVMIR/CMakeLists.txt
-index f1bbd0bf4e..ac8973ad19 100644
---- a/lib/Target/LLVMIR/CMakeLists.txt
-+++ b/lib/Target/LLVMIR/CMakeLists.txt
-@@ -6,8 +6,7 @@ add_mlir_translation_library(TritonLLVMIR
- 
-         LINK_LIBS PUBLIC
-         MLIRIR
--        MLIRLLVMIR
--        MLIRSCFToStandard
-+        MLIRLLVMDialect
-         MLIRSupport
-         MLIRTargetLLVMIRExport
-         )
-diff --git a/lib/Target/PTX/PTXTranslation.cpp b/lib/Target/PTX/PTXTranslation.cpp
-index 4cb0d8193c..6a5453a6e7 100644
---- a/lib/Target/PTX/PTXTranslation.cpp
-+++ b/lib/Target/PTX/PTXTranslation.cpp
-@@ -1,11 +1,14 @@
- #include "triton/Target/PTX/PTXTranslation.h"
- #include "triton/Target/LLVMIR/LLVMIRTranslation.h"
-+#include <optional>
- 
- #include "llvm/IR/IRBuilder.h"
- #include "llvm/IR/LegacyPassManager.h"
- #include "llvm/IR/Module.h"
- #include "llvm/IR/Verifier.h"
- #include "llvm/MC/TargetRegistry.h"
-+#include "llvm/Pass.h"
-+#include "llvm/Support/CommandLine.h"
- #include "llvm/Support/TargetSelect.h"
- #include "llvm/Target/TargetMachine.h"
- 
-diff --git a/python/setup.py b/python/setup.py
-index 2ac3accd25..4530b36714 100644
---- a/python/setup.py
-+++ b/python/setup.py
-@@ -57,19 +57,10 @@ def get_pybind11_package_info():
- def get_llvm_package_info():
-     # download if nothing is installed
-     system = platform.system()
--    if system == "Darwin":
--        system_suffix = "apple-darwin"
--    elif system == "Linux":
--        vglibc = tuple(map(int, platform.libc_ver()[1].split('.')))
--        vglibc = vglibc[0] * 100 + vglibc[1]
--        linux_suffix = 'ubuntu-18.04' if vglibc > 217 else 'centos-7'
--        system_suffix = f"linux-gnu-{linux_suffix}"
--    else:
--        raise RuntimeError(f"unsupported system: {system}")
-+    system_suffix = {"Linux": "linux-gnu-ubuntu-18.04", "Darwin": "apple-darwin"}[system]
-     use_assert_enabled_llvm = check_env_flag("TRITON_USE_ASSERT_ENABLED_LLVM", "False")
--    release_suffix = "assert" if use_assert_enabled_llvm else "release"
--    name = f'llvm+mlir-14.0.6-x86_64-{system_suffix}-{release_suffix}'
--    url = f"https://github.com/ptillet/triton-llvm-releases/releases/download/llvm-14.0.6-f28c006a5895/{name}.tar.xz"
-+    name = 'llvm+mlir-15.0.7-x86_64-{}-{}'.format(system_suffix, "assert" if use_assert_enabled_llvm else "release")
-+    url = "https://github.com/ptillet/triton-llvm-releases/releases/download/llvm-15.0.7-8dfdcc7b7bf6/{}.tar.xz".format(name)
-     return Package("llvm", name, url, "lib", "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH")
- 
- 
-diff --git a/python/src/triton.cc b/python/src/triton.cc
-index c40b117a55..f190eacc34 100644
---- a/python/src/triton.cc
-+++ b/python/src/triton.cc
-@@ -8,9 +8,10 @@
- #include "mlir/Pass/PassManager.h"
- #include "mlir/Transforms/Passes.h"
- 
--#include "mlir/Parser.h"
-+#include "mlir/Parser/Parser.h"
- #include "mlir/Support/FileUtilities.h"
- 
-+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
- #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
- #include "triton/Analysis/Allocation.h"
- #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVMPass.h"
-@@ -195,7 +196,7 @@ void init_triton_ir(py::module &&m) {
-                std::string attrName = name + "_arg" + std::to_string(id);
-                mlir::Block *owner = arg.getOwner();
-                if (owner->isEntryBlock() &&
--                   !mlir::isa<mlir::FuncOp>(owner->getParentOp())) {
-+                   !mlir::isa<mlir::func::FuncOp>(owner->getParentOp())) {
-                  owner->getParentOp()->setAttr(attrName, attr);
-                }
-              }
-@@ -348,7 +349,7 @@ void init_triton_ir(py::module &&m) {
-              return str;
-            })
-       .def("push_back",
--           [](mlir::ModuleOp &self, mlir::FuncOp &funcOp) -> void {
-+           [](mlir::ModuleOp &self, mlir::func::FuncOp &funcOp) -> void {
-              self.push_back(funcOp);
-            })
-       .def("has_function",
-@@ -358,16 +359,18 @@ void init_triton_ir(py::module &&m) {
-              return false;
-            })
-       .def("get_function",
--           [](mlir::ModuleOp &self, std::string &funcName) -> mlir::FuncOp {
--             return self.lookupSymbol<mlir::FuncOp>(funcName);
--           })
--      .def("get_single_function", [](mlir::ModuleOp &self) -> mlir::FuncOp {
--        llvm::SmallVector<mlir::FuncOp> funcs;
--        self.walk([&](mlir::FuncOp func) { funcs.push_back(func); });
--        if (funcs.size() != 1)
--          throw std::runtime_error("Expected a single function");
--        return funcs[0];
--      });
-+           [](mlir::ModuleOp &self,
-+              std::string &funcName) -> mlir::func::FuncOp {
-+             return self.lookupSymbol<mlir::func::FuncOp>(funcName);
-+           })
-+      .def("get_single_function",
-+           [](mlir::ModuleOp &self) -> mlir::func::FuncOp {
-+             llvm::SmallVector<mlir::func::FuncOp> funcs;
-+             self.walk([&](mlir::func::FuncOp func) { funcs.push_back(func); });
-+             if (funcs.size() != 1)
-+               throw std::runtime_error("Expected a single function");
-+             return funcs[0];
-+           });
- 
-   m.def("make_attr",
-         [](const std::vector<int> &values, mlir::MLIRContext &context) {
-@@ -388,47 +391,48 @@ void init_triton_ir(py::module &&m) {
-         registry.insert<mlir::triton::TritonDialect,
-                         mlir::triton::gpu::TritonGPUDialect,
-                         mlir::math::MathDialect, mlir::arith::ArithmeticDialect,
--                        mlir::StandardOpsDialect, mlir::scf::SCFDialect>();
-+                        mlir::func::FuncDialect, mlir::scf::SCFDialect>();
-         context.appendDialectRegistry(registry);
-         context.loadAllAvailableDialects();
- 
-         // parse module
--        mlir::OwningOpRef<mlir::ModuleOp> module(
--            mlir::parseSourceFile(inputFilename, &context));
-+        mlir::OwningOpRef<mlir::ModuleOp> module =
-+            mlir::parseSourceFile<mlir::ModuleOp>(inputFilename, &context);
-+        if (!module)
-+          throw std::runtime_error("Parse MLIR file failed.");
-         // locations are incompatible with ptx < 7.5 !
-         module->walk([](mlir::Operation *op) {
-           op->setLoc(mlir::UnknownLoc::get(op->getContext()));
-         });
--        if (!module)
--          throw std::runtime_error("Parse MLIR file failed.");
- 
-         return module->clone();
-       },
-       ret::take_ownership);
- 
--  py::class_<mlir::FuncOp, mlir::OpState>(m, "function")
-+  py::class_<mlir::func::FuncOp, mlir::OpState>(m, "function")
-       // .def_property_readonly("attrs", &ir::function::attrs)
-       // .def("add_attr", &ir::function::add_attr);
-       .def("args",
--           [](mlir::FuncOp &self, unsigned idx) -> mlir::BlockArgument {
-+           [](mlir::func::FuncOp &self, unsigned idx) -> mlir::BlockArgument {
-              return self.getArgument(idx);
-            })
-       .def(
-           "add_entry_block",
--          [](mlir::FuncOp &self) -> mlir::Block * {
-+          [](mlir::func::FuncOp &self) -> mlir::Block * {
-             return self.addEntryBlock();
-           },
-           ret::reference)
-       .def(
-           "set_arg_attr",
--          [](mlir::FuncOp &self, int arg_no, const std::string &name, int val) {
-+          [](mlir::func::FuncOp &self, int arg_no, const std::string &name,
-+             int val) {
-             // set arg attributes "name" to value "val"
-             auto attrTy = mlir::IntegerType::get(self.getContext(), 32);
-             self.setArgAttr(arg_no, name, mlir::IntegerAttr::get(attrTy, val));
-           },
-           ret::reference)
--      .def_property_readonly("type", &mlir::FuncOp::getType)
--      .def("reset_type", &mlir::FuncOp::setType);
-+      .def_property_readonly("type", &mlir::func::FuncOp::getFunctionType)
-+      .def("reset_type", &mlir::func::FuncOp::setType);
- 
-   py::class_<mlir::OpBuilder::InsertPoint>(m, "InsertPoint");
- 
-@@ -445,13 +449,13 @@ void init_triton_ir(py::module &&m) {
-       .def("ret",
-            [](mlir::OpBuilder &self, std::vector<mlir::Value> &vals) -> void {
-              auto loc = self.getUnknownLoc();
--             self.create<mlir::ReturnOp>(loc, vals);
-+             self.create<mlir::func::ReturnOp>(loc, vals);
-            })
-       .def("call",
--           [](mlir::OpBuilder &self, mlir::FuncOp &func,
-+           [](mlir::OpBuilder &self, mlir::func::FuncOp &func,
-               std::vector<mlir::Value> &args) -> mlir::OpState {
-              auto loc = self.getUnknownLoc();
--             return self.create<mlir::CallOp>(loc, func, args);
-+             return self.create<mlir::func::CallOp>(loc, func, args);
-            })
-       // insertion block/point
-       .def("set_insertion_point_to_start",
-@@ -618,15 +622,16 @@ void init_triton_ir(py::module &&m) {
-       .def("get_or_insert_function",
-            [](mlir::OpBuilder &self, mlir::ModuleOp &module,
-               std::string &funcName, mlir::Type &funcType,
--              std::string &visibility) -> mlir::FuncOp {
-+              std::string &visibility) -> mlir::func::FuncOp {
-              if (mlir::Operation *funcOperation = module.lookupSymbol(funcName))
--               return llvm::dyn_cast<mlir::FuncOp>(funcOperation);
-+               return llvm::dyn_cast<mlir::func::FuncOp>(funcOperation);
-              auto loc = self.getUnknownLoc();
-              if (auto funcTy = funcType.dyn_cast<mlir::FunctionType>()) {
-                llvm::SmallVector<mlir::NamedAttribute> attrs = {
-                    mlir::NamedAttribute(self.getStringAttr("sym_visibility"),
-                                         self.getStringAttr(visibility))};
--               return self.create<mlir::FuncOp>(loc, funcName, funcTy, attrs);
-+               return self.create<mlir::func::FuncOp>(loc, funcName, funcTy,
-+                                                      attrs);
-              }
-              throw std::runtime_error("invalid function type");
-            })
-@@ -658,15 +663,15 @@ void init_triton_ir(py::module &&m) {
-            [](mlir::OpBuilder &self, mlir::Value condition,
-               mlir::Block *trueDest, mlir::Block *falseDest) {
-              auto loc = self.getUnknownLoc();
--             self.create<mlir::CondBranchOp>(loc, condition, trueDest,
--                                             falseDest);
-+             self.create<mlir::cf::CondBranchOp>(loc, condition, trueDest,
-+                                                 falseDest);
-              return;
-            })
-       .def("create_branch",
-            [](mlir::OpBuilder &self, mlir::Block *dest,
-               std::vector<mlir::Value> &args) {
-              auto loc = self.getUnknownLoc();
--             self.create<mlir::BranchOp>(loc, dest, args);
-+             self.create<mlir::cf::BranchOp>(loc, dest, args);
-              return;
-            })
-       // Structured control flow
-@@ -792,14 +797,14 @@ void init_triton_ir(py::module &&m) {
-       .def("create_to_index",
-            [](mlir::OpBuilder &self, mlir::Value &input) -> mlir::Value {
-              auto loc = self.getUnknownLoc();
--             return self.create<mlir::arith::IndexCastOp>(loc, input,
--                                                          self.getIndexType());
-+             return self.create<mlir::arith::IndexCastOp>(
-+                 loc, self.getIndexType(), input);
-            })
-       .def("create_index_to_si",
-            [](mlir::OpBuilder &self, mlir::Value &input) -> mlir::Value {
-              auto loc = self.getUnknownLoc();
--             return self.create<mlir::arith::IndexCastOp>(loc, input,
--                                                          self.getI32Type());
-+             return self.create<mlir::arith::IndexCastOp>(
-+                 loc, self.getI32Type(), input);
-            })
-       .def("create_fmul",
-            [](mlir::OpBuilder &self, mlir::Value &lhs,
-@@ -1316,8 +1321,8 @@ void init_triton_ir(py::module &&m) {
-            [](mlir::OpBuilder &self, mlir::Value &condition,
-               mlir::Value &trueValue, mlir::Value &falseValue) -> mlir::Value {
-              auto loc = self.getUnknownLoc();
--             return self.create<mlir::SelectOp>(loc, condition, trueValue,
--                                                falseValue);
-+             return self.create<mlir::arith::SelectOp>(loc, condition,
-+                                                       trueValue, falseValue);
-            })
-       .def("create_printf",
-            [](mlir::OpBuilder &self, const std::string &prefix,
-@@ -1429,7 +1434,7 @@ void init_triton_ir(py::module &&m) {
-              self.addPass(mlir::triton::createConvertTritonGPUToLLVMPass());
-            })
-       .def("add_scf_to_cfg", [](mlir::PassManager &self) {
--        self.addPass(mlir::createLowerToCFGPass());
-+        self.addPass(mlir::createConvertSCFToCFPass());
-       });
- }
- 
-diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
-index 432544a8a4..018f544714 100644
---- a/python/test/unit/language/test_core.py
-+++ b/python/test/unit/language/test_core.py
-@@ -1918,7 +1918,7 @@ def test_convert2d(dtype, shape, src_layout, dst_layout, device='cuda'):
- #dst = {dst_layout}
- """ + """
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
--  func public @kernel_0d1d(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
-+  func.func public @kernel_0d1d(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
-     %cst = arith.constant dense<128> : tensor<128x1xi32, #src>
-     %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #src}>>
-     %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #src}>>
-diff --git a/python/triton/compiler.py b/python/triton/compiler.py
-index 5d167634df..c36589037c 100644
---- a/python/triton/compiler.py
-+++ b/python/triton/compiler.py
-@@ -1514,14 +1514,14 @@ def make_hash(fn, **kwargs):
-     return hashlib.md5((Path(fn).read_text() + triton.runtime.jit.version_key()).encode("utf-8")).hexdigest()
- 
- 
--# - ^\s*func\s+ : match the start of the string, any leading whitespace, the keyword func,
-+# - ^\s*func\.func\s+ : match the start of the string, any leading whitespace, the keyword func,
- #    and any following whitespace
- # - (public\s+)? : optionally match the keyword public and any following whitespace
- # - (@\w+) : match an @ symbol followed by one or more word characters
- #   (letters, digits, or underscores), and capture it as group 1 (the function name)
- # - (\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\)) : match a pair of parentheses enclosing
- #   zero or more arguments separated by commas, and capture it as group 2 (the argument list)
--mlir_prototype_pattern = r'^\s*func\s+(?:public\s+)?(@\w+)(\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\))\s*\{\s*$'
-+mlir_prototype_pattern = r'^\s*func\.func\s+(?:public\s+)?(@\w+)(\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\))\s*\{\s*$'
- ptx_prototype_pattern = r"\.(?:visible|extern)\s+\.(?:entry|func)\s+(\w+)\s*\(([^)]*)\)"
- prototype_pattern = {
-     "ttir": mlir_prototype_pattern,
-diff --git a/test/Analysis/test-alias.mlir b/test/Analysis/test-alias.mlir
-index b3d5673f85..bb21615e68 100644
---- a/test/Analysis/test-alias.mlir
-+++ b/test/Analysis/test-alias.mlir
-@@ -11,7 +11,7 @@
- 
- // CHECK-LABEL: matmul_loop
- // There shouldn't be any aliasing with the dot op encoding.
--func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-+func.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-   %a_ptr_init = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
-   %b_ptr_init = tt.broadcast %B : (!tt.ptr<f16>) -> tensor<32x128x!tt.ptr<f16>, #BL>
-   %a_mask = arith.constant dense<true> : tensor<128x32xi1, #AL>
-@@ -36,7 +36,7 @@ func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B
- }
- 
- // CHECK-LABEL: alloc
--func @alloc(%A : !tt.ptr<f16>) {
-+func.func @alloc(%A : !tt.ptr<f16>) {
-   // CHECK: %cst -> %cst
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
-@@ -46,7 +46,7 @@ func @alloc(%A : !tt.ptr<f16>) {
- }
- 
- // CHECK-LABEL: convert
--func @convert(%A : !tt.ptr<f16>) {
-+func.func @convert(%A : !tt.ptr<f16>) {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
-   // CHECK: %0 -> %0
-   %cst1 = triton_gpu.convert_layout %cst0 : (tensor<16x16xf16, #AL>) -> tensor<16x16xf16, #A_SHARED>
-@@ -54,7 +54,7 @@ func @convert(%A : !tt.ptr<f16>) {
- }
- 
- // CHECK-LABEL: trans
--func @trans(%A : !tt.ptr<f16>) {
-+func.func @trans(%A : !tt.ptr<f16>) {
-   // CHECK: %cst -> %cst
-   %tensor = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
-   // CHECK: %0 -> %cst
-@@ -63,7 +63,7 @@ func @trans(%A : !tt.ptr<f16>) {
- }
- 
- // CHECK-LABEL: insert_slice_async
--func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
-+func.func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
-   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
-   %mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL>
-   %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
-@@ -76,7 +76,7 @@ func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
- }
- 
- // CHECK-LABEL: insert_slice
--func @insert_slice(%A : !tt.ptr<f16>, %i1 : i1) {
-+func.func @insert_slice(%A : !tt.ptr<f16>, %i1 : i1) {
-   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
-   %mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL>
-   %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
-@@ -90,7 +90,7 @@ func @insert_slice(%A : !tt.ptr<f16>, %i1 : i1) {
- }
- 
- // CHECK-LABEL: extract_slice
--func @extract_slice(%A : !tt.ptr<f16>) {
-+func.func @extract_slice(%A : !tt.ptr<f16>) {
-   // CHECK: %cst -> %cst
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED>
-   %index = arith.constant 0 : index
-@@ -100,7 +100,7 @@ func @extract_slice(%A : !tt.ptr<f16>) {
- }
- 
- // CHECK-LABEL: if_cat
--func @if_cat(%i1 : i1) {
-+func.func @if_cat(%i1 : i1) {
-   // CHECK: %cst -> %cst
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   // CHECK: %cst_0 -> %cst_0
-@@ -119,7 +119,7 @@ func @if_cat(%i1 : i1) {
- }
- 
- // CHECK-LABEL: if_alias
--func @if_alias(%i1 : i1) {
-+func.func @if_alias(%i1 : i1) {
-   // CHECK: %cst -> %cst
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   // CHECK-NEXT: %cst_0 -> %cst_0
-@@ -134,7 +134,7 @@ func @if_alias(%i1 : i1) {
- }
- 
- // CHECK-LABEL: for
--func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-+func.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-   // CHECK: %cst -> %cst
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   // CHECK-NEXT: %cst_0 -> %cst_0
-@@ -154,7 +154,7 @@ func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.p
- }
- 
- // CHECK-LABEL: for_if
--func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-+func.func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-   // CHECK: %cst -> %cst
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   // CHECK-NEXT: %cst_0 -> %cst_0
-@@ -180,7 +180,7 @@ func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !t
- }
- 
- // CHECK-LABEL: for_if_for
--func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-+func.func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-   // CHECK: %cst -> %cst
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   // CHECK-NEXT: %cst_0 -> %cst_0
-diff --git a/test/Analysis/test-alignment.mlir b/test/Analysis/test-alignment.mlir
-index 0ab34c7a78..af8ea6f856 100644
---- a/test/Analysis/test-alignment.mlir
-+++ b/test/Analysis/test-alignment.mlir
-@@ -1,288 +1,288 @@
--// RUN: triton-opt %s -test-print-alignment -split-input-file 2>&1 | FileCheck %s
-+// RUN: triton-opt %s -test-print-alignment -split-input-file -o %t 2>&1 | FileCheck %s
- 
--// CHECK-LABEL: cast
--func @cast() {
--  // CHECK: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [1]
-+// CHECK-LABEL: @cast
-+func.func @cast() {
-+  // CHECK: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1
-   %cst = arith.constant 1 : i32
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [1]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 1
-   %0 = arith.extsi %cst : i32 to i64
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
-   %cst_tensor = arith.constant dense<1> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
-   %1 = tt.bitcast %cst_tensor : tensor<128xi32> -> tensor<128xi64>
-   return
- }
- 
- // -----
- 
--// CHECK-LABEL: add
--func @add() {
--  // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+// CHECK-LABEL: @add
-+func.func @add() {
-+  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
-   %1 = arith.constant dense<1> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128], divisibility = [1], constancy = [1], constant_value = <none>
-   %2 = arith.addi %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [127]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 127
-   %3 = arith.constant dense<127> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [128]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
-   %4 = arith.addi %1, %3 : tensor<128xi32>
-   return
- }
- 
- // -----
- 
--// CHECK-LABEL: sub
--func @sub() {
--  // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+// CHECK-LABEL: @sub
-+func.func @sub() {
-+  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
-   %1 = arith.constant dense<1> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128], divisibility = [1], constancy = [1], constant_value = <none>
-   %2 = arith.subi %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [129]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 129
-   %3 = arith.constant dense<129> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [128]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
-   %4 = arith.subi %3, %1 : tensor<128xi32>
-   return
- }
- 
- // -----
- 
--// CHECK-LABEL: mul
--func @mul() {
--  // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+// CHECK-LABEL: @mul
-+func.func @mul() {
-+  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
-   %1 = arith.constant dense<1> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %2 = arith.muli %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [128]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
-   %3 = arith.constant dense<128> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [128]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
-   %4 = arith.muli %3, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [2] ; Constancy: [128] ; ConstantValue: [2]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [128], constant_value = 2
-   %5 = arith.constant dense<2> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [256] ; Constancy: [128] ; ConstantValue: [256]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [256], constancy = [128], constant_value = 256
-   %6 = arith.muli %4, %5 : tensor<128xi32>
-   return
- }
- 
- // -----
- 
--// CHECK-LABEL: div
--func @div() {
--  // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+// CHECK-LABEL: @div
-+func.func @div() {
-+  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
-   %1 = arith.constant dense<1> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %2 = arith.divsi %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %3 = arith.divui %1, %0 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [64] ; Constancy: [128] ; ConstantValue: [64]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64
-   %4 = arith.constant dense<64> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16777216] ; Constancy: [64] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [16777216], constancy = [64], constant_value = <none>
-   %5 = arith.divsi %0, %4 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %6 = arith.divsi %4, %0 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [64] ; Constancy: [128] ; ConstantValue: [64]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64
-   %7 = arith.divsi %4, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [2] ; Constancy: [128] ; ConstantValue: [66]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [128], constant_value = 66
-   %8 = arith.constant dense<66> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [2] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [2], constant_value = <none>
-   %9 = arith.divui %0, %8 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [128] ; Divisibility: [8192] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128], divisibility = [8192], constancy = [1], constant_value = <none>
-   %10 = tt.make_range {end = 8320 : i32, start = 8192 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [64] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [64], constant_value = <none>
-   %11 = arith.divsi %10, %4 : tensor<128xi32>
--  return 
-+  return
- }
- 
- // -----
- 
--// CHECK-LABEL: rem
--func @rem() {
--  // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+// CHECK-LABEL: @rem
-+func.func @rem() {
-+  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [1]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 1
-   %1 = arith.constant dense<1> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [128] ; ConstantValue: [0]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0
-   %2 = arith.remsi %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %3 = arith.remui %1, %0 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [64] ; Constancy: [128] ; ConstantValue: [64]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64
-   %4 = arith.constant dense<64> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [64] ; Divisibility: [64] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [64], divisibility = [64], constancy = [1], constant_value = <none>
-   %5 = arith.remsi %0, %4 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [64] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [64], constancy = [1], constant_value = <none>
-   %6 = arith.remsi %4, %0 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [2] ; Constancy: [128] ; ConstantValue: [66]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [2], constancy = [128], constant_value = 66
-   %7 = arith.constant dense<66> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [2] ; Divisibility: [2] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [2], divisibility = [2], constancy = [1], constant_value = <none>
-   %8 = arith.remui %0, %7 : tensor<128xi32>
--  return 
-+  return
- }
- 
- // -----
- 
--// CHECK-LABEL: broadcast
--func @broadcast() {
--  // CHECK: Contiguity: [1] ; Divisibility: [64] ; Constancy: [128] ; ConstantValue: [64]
-+// CHECK-LABEL: @broadcast
-+func.func @broadcast() {
-+  // CHECK: contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64
-   %0 = arith.constant dense<64> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [64, 1] ; Constancy: [128, 1] ; ConstantValue: [64]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [64, 1], constancy = [128, 1], constant_value = 64
-   %1 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<128xi32>) -> tensor<128x1xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [64, 1] ; Constancy: [128, 128] ; ConstantValue: [64]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [64, 1], constancy = [128, 128], constant_value = 64
-   %2 = tt.broadcast %1 : (tensor<128x1xi32>) -> tensor<128x128xi32>
-   return
- }
- 
- // -----
- 
--// CHECK-LABEL: splat
--func @splat(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
--  // CHECK: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [128, 128] ; ConstantValue: [None]
-+// CHECK-LABEL: @splat
-+func.func @splat(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
-+  // CHECK: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 128], constant_value = <none>
-   %0 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<128x128x!tt.ptr<f32>>
-   return
- }
- 
- // -----
- 
--// CHECK-LABEL: cmp
--func @cmp() {
--  // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+// CHECK-LABEL: @cmp
-+func.func @cmp() {
-+  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [128] ; ConstantValue: [0]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0
-   %1 = arith.constant dense<0> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
-   %2 = arith.cmpi eq, %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
-   %3 = arith.cmpi slt, %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %4 = arith.cmpi sle, %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
-   %5 = arith.cmpi sge, %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [8] ; Constancy: [128] ; ConstantValue: [8]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8
-   %6 = arith.constant dense<8> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [8] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
-   %7 = arith.cmpi sgt, %0, %6 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [0]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = 0
-   %8 = arith.cmpi sgt, %1, %6 : tensor<128xi32>
-   return
- }
- 
- // -----
- 
--// CHECK-LABEL: logic
--func @logic() {
--  // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+// CHECK-LABEL: @logic
-+func.func @logic() {
-+  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [64] ; Constancy: [128] ; ConstantValue: [64]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [64], constancy = [128], constant_value = 64
-   %1 = arith.constant dense<64> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16777216] ; Constancy: [64] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [16777216], constancy = [64], constant_value = <none>
-   %2 = arith.divsi %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [8] ; Constancy: [128] ; ConstantValue: [8]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8
-   %3 = arith.constant dense<8> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [134217728] ; Constancy: [8] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [134217728], constancy = [8], constant_value = <none>
-   %4 = arith.divsi %0, %3 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %5 = arith.andi %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %6 = arith.ori %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %7 = arith.xori %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [8] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
-   %8 = arith.andi %2, %4 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [8] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
-   %9 = arith.ori %2, %4 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [8] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [8], constant_value = <none>
-   %10 = arith.xori %2, %4 : tensor<128xi32>
-   return
- }
- 
- // -----
- 
--// CHECK-LABEL: select
--func @select() {
--  // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+// CHECK-LABEL: @select
-+func.func @select() {
-+  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [128] ; ConstantValue: [0]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0
-   %1 = arith.constant dense<0> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
-   %2 = arith.cmpi eq, %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
-   %3 = arith.cmpi slt, %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [1] ; ConstantValue: [0]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0
-   %4 = arith.constant 0 : i1
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [128] ; ConstantValue: [0]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0
-   %7 = tt.splat %4 : (i1) -> tensor<128xi1>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [128] ; ConstantValue: [0]
--  %5 = select %4, %3, %7 : tensor<128xi1>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [128], constant_value = 0
-+  %5 = arith.select %4, %3, %7 : tensor<128xi1>
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [128], constant_value = <none>
-   %8 = "triton_gpu.select"(%7, %3, %2) : (tensor<128xi1>, tensor<128xi1>, tensor<128xi1>) -> tensor<128xi1>
-   return
- }
- 
- // -----
- 
--func @shift() {
--  // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+func.func @shift() {
-+  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [8] ; Constancy: [128] ; ConstantValue: [8]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8
-   %1 = arith.constant dense<8> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4] ; Constancy: [128] ; ConstantValue: [4]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [128], constant_value = 4
-   %2 = arith.constant dense<4> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [274877906944] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [274877906944], constancy = [1], constant_value = <none>
-   %3 = arith.shli %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [67108864] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [67108864], constancy = [1], constant_value = <none>
-   %4 = arith.shrsi %0, %2 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [128]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = 128
-   %5 = arith.shli %1, %2 : tensor<128xi32>
-   return
- }
- 
- // -----
- 
--func @max_min() {
--  // CHECK: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+func.func @max_min() {
-+  // CHECK: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [128] ; Divisibility: [64] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128], divisibility = [64], constancy = [1], constant_value = <none>
-   %1 = tt.make_range {end = 192 : i32, start = 64 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %2 = arith.maxsi %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %3 = arith.minsi %0, %1 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [8] ; Constancy: [128] ; ConstantValue: [8]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [8], constancy = [128], constant_value = 8
-   %4 = arith.constant dense<8> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4] ; Constancy: [128] ; ConstantValue: [4]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [4], constancy = [128], constant_value = 4
-   %5 = arith.constant dense<4> : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [8]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = 8
-   %6 = arith.maxsi %4, %5 : tensor<128xi32>
-   return
- }
- 
- // -----
- 
--// CHECK-LABEL: for
--func @for() {
--  // CHECK: Contiguity: [1, 1] ; Divisibility: [4611686018427387904, 4611686018427387904] ; Constancy: [128, 32] ; ConstantValue: [0]
-+// CHECK-LABEL: @for
-+func.func @for() {
-+  // CHECK: contiguity = [1, 1], divisibility = [4611686018427387904, 4611686018427387904], constancy = [128, 32], constant_value = 0
-   %a_init = arith.constant dense<0> : tensor<128x32xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [128, 32] ; ConstantValue: [1]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = 1
-   %b_init = arith.constant dense<1> : tensor<128x32xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [4, 4] ; Constancy: [128, 32] ; ConstantValue: [4]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [4, 4], constancy = [128, 32], constant_value = 4
-   %c_init = arith.constant dense<4> : tensor<128x32xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [1] ; ConstantValue: [128]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [1], constant_value = 128
-   %ub = arith.constant 128 : index
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [4611686018427387904] ; Constancy: [1] ; ConstantValue: [0]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [4611686018427387904], constancy = [1], constant_value = 0
-   %lb = arith.constant 0 : index
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16] ; Constancy: [1] ; ConstantValue: [16]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [1], constant_value = 16
-   %step = arith.constant 16 : index
-   %a, %b, %c = scf.for %iv = %lb to %ub step %step iter_args(%a = %a_init, %b = %b_init, %c = %c_init) -> (tensor<128x32xi32>, tensor<128x32xi32>, tensor<128x32xi32>) {
--    // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16] ; Constancy: [1] ; ConstantValue: [None]
-+    // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [1], constant_value = <none>
-     %t = arith.index_cast %iv : index to i32
--    // CHECK: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [128, 32] ; ConstantValue: [None]
--    // CHECK: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [128, 32] ; ConstantValue: [None]
--    // CHECK: Contiguity: [1, 1] ; Divisibility: [4, 4] ; Constancy: [128, 32] ; ConstantValue: [4]
-+    // CHECK: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = <none>
-+    // CHECK: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 32], constant_value = <none>
-+    // CHECK: contiguity = [1, 1], divisibility = [4, 4], constancy = [128, 32], constant_value = 4
-     scf.yield %b, %a, %c : tensor<128x32xi32>, tensor<128x32xi32>, tensor<128x32xi32>
-   }
-   return
-@@ -290,53 +290,53 @@ func @for() {
- 
- // -----
- 
--// CHECK-LABEL: permute_2d
--func @permute_2d(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) {
--  // CHECK: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [128, 128] ; ConstantValue: [1]
-+// CHECK-LABEL: @permute_2d
-+func.func @permute_2d(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) {
-+  // CHECK: contiguity = [1, 1], divisibility = [1, 1], constancy = [128, 128], constant_value = 1
-   %cst = arith.constant dense<true> : tensor<128x128xi1>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
-   %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32>
--  // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %0 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [1073741824, 1] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128, 1], divisibility = [1073741824, 1], constancy = [1, 1], constant_value = <none>
-   %2 = tt.expand_dims %0 {axis = 1 : i32} : (tensor<128xi32>) -> tensor<128x1xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [128, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>
-   %3 = tt.splat %arg1 : (i32) -> tensor<128x1xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [17179869184, 16] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [17179869184, 16], constancy = [1, 1], constant_value = <none>
-   %4 = arith.muli %2, %3 : tensor<128x1xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [128, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>
-   %5 = tt.splat %arg0 : (!tt.ptr<f32>) -> tensor<128x1x!tt.ptr<f32>>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 1], constant_value = <none>
-   %6 = tt.addptr %5, %4 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32>
--  // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 1073741824] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [1, 1], constant_value = <none>
-   %7 = tt.expand_dims %1 {axis = 0 : i32}: (tensor<128xi32>) -> tensor<1x128xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [1, 128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 128], constant_value = <none>
-   %8 = tt.broadcast %6 : (tensor<128x1x!tt.ptr<f32>>) -> tensor<128x128x!tt.ptr<f32>>
--  // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 1073741824] ; Constancy: [128, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [128, 1], constant_value = <none>
-   %9 = tt.broadcast %7 : (tensor<1x128xi32>) -> tensor<128x128xi32>
--  // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 16] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 16], constancy = [1, 1], constant_value = <none>
-   %10 = tt.addptr %8, %9 : tensor<128x128x!tt.ptr<f32>>, tensor<128x128xi32>
--  // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [1073741824, 1] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128, 1], divisibility = [1073741824, 1], constancy = [1, 1], constant_value = <none>
-   %11 = tt.expand_dims %0 {axis = 1 : i32}: (tensor<128xi32>) -> tensor<128x1xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [128, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [128, 1], constant_value = <none>
-   %12 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<128x1x!tt.ptr<f32>>
--  // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [16, 1] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128, 1], divisibility = [16, 1], constancy = [1, 1], constant_value = <none>
-   %13 = tt.addptr %12, %11 : tensor<128x1x!tt.ptr<f32>>, tensor<128x1xi32>
--  // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 1073741824] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 128], divisibility = [1, 1073741824], constancy = [1, 1], constant_value = <none>
-   %14 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [1, 128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 16], constancy = [1, 128], constant_value = <none>
-   %15 = tt.splat %arg3 : (i32) -> tensor<1x128xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 17179869184] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 17179869184], constancy = [1, 1], constant_value = <none>
-   %16 = arith.muli %14, %15 : tensor<1x128xi32>
--  // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [16, 1] ; Constancy: [1, 128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128, 1], divisibility = [16, 1], constancy = [1, 128], constant_value = <none>
-   %17 = tt.broadcast %13 : (tensor<128x1x!tt.ptr<f32>>) -> tensor<128x128x!tt.ptr<f32>>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 17179869184] ; Constancy: [128, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [16, 17179869184], constancy = [128, 1], constant_value = <none>
-   %18 = tt.broadcast %16 : (tensor<1x128xi32>) -> tensor<128x128xi32>
--  // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [16, 1] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128, 1], divisibility = [16, 1], constancy = [1, 1], constant_value = <none>
-   %19 = tt.addptr %17, %18 : tensor<128x128x!tt.ptr<f32>>, tensor<128x128xi32>
--  // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [1, 1] ; Constancy: [1, 1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1, 1], divisibility = [1, 1], constancy = [1, 1], constant_value = <none>
-   %20 = tt.load %10, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x128xf32>
-   tt.store %19, %20, %cst : tensor<128x128xf32>
-   return
-@@ -347,29 +347,29 @@ func @permute_2d(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {t
- module {
- 
- // This is a tiny test for verifying StoreOp-related alignment, It simply store a constant to a buffer.
--// CHECK-LABEL: store_constant_align
--func @store_constant_align(%addr: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n: i32 {tt.divisibility = 16 : i32}) {
--  // CHECK: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+// CHECK-LABEL: @store_constant_align
-+func.func @store_constant_align(%addr: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n: i32 {tt.divisibility = 16 : i32}) {
-+  // CHECK: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %pid = tt.get_program_id {axis = 0 : i32} : i32
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [1] ; ConstantValue: [128]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [1], constant_value = 128
-   %c128_i32 = arith.constant 128 : i32
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [1], constant_value = <none>
-   %1 = arith.muli %pid, %c128_i32 : i32
--  // CHECK-NEXT: Contiguity: [128] ; Divisibility: [1073741824] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128], divisibility = [1073741824], constancy = [1], constant_value = <none>
-   %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
-- // CHECK-NEXT: Contiguity: [1] ; Divisibility: [128] ; Constancy: [128] ; ConstantValue: [None]
-+ // CHECK-NEXT: contiguity = [1], divisibility = [128], constancy = [128], constant_value = <none>
-   %3 = tt.splat %1 : (i32) -> tensor<128xi32>
-- // CHECK-NEXT: Contiguity: [128] ; Divisibility: [128] ; Constancy: [1] ; ConstantValue: [None]
-+ // CHECK-NEXT: contiguity = [128], divisibility = [128], constancy = [1], constant_value = <none>
-   %4 = arith.addi %3, %2 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16] ; Constancy: [128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [128], constant_value = <none>
-   %5 = tt.splat %addr : (!tt.ptr<f32>) -> tensor<128x!tt.ptr<f32>>
--  // CHECK-NEXT: Contiguity: [128] ; Divisibility: [16] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [128], divisibility = [16], constancy = [1], constant_value = <none>
-   %6 = tt.addptr %5, %4 : tensor<128x!tt.ptr<f32>>, tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [16] ; Constancy: [128] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [16], constancy = [128], constant_value = <none>
-   %9 = tt.splat %n : (i32) -> tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [16] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>
-   %mask = arith.cmpi slt, %4, %9 : tensor<128xi32>
--  // CHECK-NEXT: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None]
-+  // CHECK-NEXT: contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %cst = arith.constant dense<0.0> : tensor<128xf32>
-   tt.store %5, %cst, %mask : tensor<128xf32>
-   return
-@@ -381,8 +381,8 @@ func @store_constant_align(%addr: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n:
- 
- // This IR is dumped from vecadd test.
- // Note, the hint {tt.divisibility = 16 : i32} for %n_elements affects the alignment of mask.
--// CHECK-LABEL: vecadd_mask_align_16
--func @vecadd_mask_align_16(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n_elements: i32 {tt.divisibility = 16 : i32}) {
-+// CHECK-LABEL: @vecadd_mask_align_16
-+func.func @vecadd_mask_align_16(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n_elements: i32 {tt.divisibility = 16 : i32}) {
-   %c64_i32 = arith.constant 64 : i32
-   %0 = tt.get_program_id {axis = 0 : i32} : i32
-   %1 = arith.muli %0, %c64_i32 : i32
-@@ -394,13 +394,13 @@ func @vecadd_mask_align_16(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %ar
-   %7 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<64x!tt.ptr<f32>>
-   %8 = tt.addptr %7, %4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
-   %9 = tt.splat %n_elements : (i32) -> tensor<64xi32>
--  // CHECK: Contiguity: [1] ; Divisibility: [1] ; Constancy: [16] ; ConstantValue: [None] ( %{{.*}} = arith.cmpi slt, %{{.*}}, %{{.*}} : tensor<64xi32> )
-+  // CHECK: arith.cmpi slt, %{{.*}} => contiguity = [1], divisibility = [1], constancy = [16], constant_value = <none>
-   %mask = arith.cmpi slt, %4, %9 : tensor<64xi32>
-   %11 = tt.load %6, %mask {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xf32>
-   %12 = tt.load %8, %mask {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xf32>
-   %13 = arith.addf %11, %12 : tensor<64xf32>
-   %14 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<64x!tt.ptr<f32>>
--  // CHECK: Contiguity: [64] ; Divisibility: [16] ; Constancy: [1] ; ConstantValue: [None] ( %{{.*}} = tt.addptr %{{.*}}, %{{.*}} : tensor<64x!tt.ptr<f32>>, tensor<64xi32> )
-+  // CHECK: tt.addptr %{{.*}} => contiguity = [64], divisibility = [16], constancy = [1], constant_value = <none>
-   %15 = tt.addptr %14, %4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
-   tt.store %15, %13, %mask : tensor<64xf32>
-   return
-@@ -410,8 +410,8 @@ func @vecadd_mask_align_16(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %ar
- 
- // This IR is dumped from vecadd test.
- // Note, there is no divisibility hint for %n_elements, Triton should assume its divisibility to be 1 by default.
--// CHECK-LABEL: vecadd_mask_align_1
--func @vecadd_mask_align_1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n_elements: i32) {
-+// CHECK-LABEL: @vecadd_mask_align_1
-+func.func @vecadd_mask_align_1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n_elements: i32) {
-   %c64_i32 = arith.constant 64 : i32
-   %0 = tt.get_program_id {axis = 0 : i32} : i32
-   %1 = arith.muli %0, %c64_i32 : i32
-@@ -423,7 +423,7 @@ func @vecadd_mask_align_1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg
-   %7 = tt.splat %arg1 : (!tt.ptr<f32>) -> tensor<64x!tt.ptr<f32>>
-   %8 = tt.addptr %7, %4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
-   %9 = tt.splat %n_elements : (i32) -> tensor<64xi32>
--  // CHECK: Contiguity: [1] ; Divisibility: [1] ; Constancy: [1] ; ConstantValue: [None] ( %{{.*}} = arith.cmpi slt, %{{.*}}, %{{.*}} : tensor<64xi32> )
-+  // CHECK: arith.cmpi slt, %{{.*}} => contiguity = [1], divisibility = [1], constancy = [1], constant_value = <none>
-   %10 = arith.cmpi slt, %4, %9 : tensor<64xi32>
-   %11 = tt.load %6, %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xf32>
-   %12 = tt.load %8, %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64xf32>
-diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir
-index efb00c404d..f79222aa7b 100644
---- a/test/Analysis/test-allocation.mlir
-+++ b/test/Analysis/test-allocation.mlir
-@@ -13,7 +13,7 @@
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
- 
- // CHECK-LABEL: matmul_loop
--func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-+func.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-   %a_ptr_init = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
-   %b_ptr_init = tt.broadcast %B : (!tt.ptr<f16>) -> tensor<32x128x!tt.ptr<f16>, #BL>
- 
-@@ -46,7 +46,7 @@ func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B
- 
- // Shared memory is available after a tensor's liveness range ends
- // CHECK-LABEL: reusable
--func @reusable(%A : !tt.ptr<f16>) {
-+func.func @reusable(%A : !tt.ptr<f16>) {
-   %cst1 = arith.constant dense<true> : tensor<128x32xi1, #AL>
-   %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL>
-   %cst3 = arith.constant dense<true> : tensor<32x128xi1, #AL>
-@@ -78,7 +78,7 @@ func @reusable(%A : !tt.ptr<f16>) {
- // %cst1->%cst4
- // %cst3->%g->%h->%i
- // CHECK-LABEL: preallocate
--func @preallocate(%A : !tt.ptr<f16>) {
-+func.func @preallocate(%A : !tt.ptr<f16>) {
-   // CHECK: offset = 0, size = 512
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   // CHECK-NEXT: offset = 1024, size = 512
-@@ -113,7 +113,7 @@ func @preallocate(%A : !tt.ptr<f16>) {
- 
- // Unused tensors are immediately released
- // CHECK-LABEL: unused
--func @unused(%A : !tt.ptr<f16>) {
-+func.func @unused(%A : !tt.ptr<f16>) {
-   // CHECK: offset = 0, size = 1024
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #A_SHARED>
-   // CHECK-NEXT: offset = 0, size = 512
-@@ -128,7 +128,7 @@ func @unused(%A : !tt.ptr<f16>) {
- 
- // cst0 is alive through the entire function, it cannot be released before the end of the function
- // CHECK-LABEL: longlive
--func @longlive(%A : !tt.ptr<f16>) {
-+func.func @longlive(%A : !tt.ptr<f16>) {
-   // CHECK: offset = 0, size = 512
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   // CHECK-NEXT: offset = 512, size = 512
-@@ -156,7 +156,7 @@ func @longlive(%A : !tt.ptr<f16>) {
- }
- 
- // CHECK-LABEL: alloc
--func @alloc(%A : !tt.ptr<f16>) {
-+func.func @alloc(%A : !tt.ptr<f16>) {
-   // CHECK: offset = 0, size = 512
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   %cst1 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #AL>
-@@ -167,7 +167,7 @@ func @alloc(%A : !tt.ptr<f16>) {
- }
- 
- // CHECK-LABEL: scratch
--func @scratch() {
-+func.func @scratch() {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
-   // CHECK: scratch offset = 0, size = 512
-   %b = tt.reduce %cst0 {redOp = 1 : i32, axis = 0 : i32} : tensor<16x16xf16, #AL> -> tensor<16xf16, #sliceAd0>
-@@ -176,7 +176,7 @@ func @scratch() {
- }
- 
- // CHECK-LABEL: trans
--func @trans(%A : !tt.ptr<f16>) {
-+func.func @trans(%A : !tt.ptr<f16>) {
-   // CHECK: offset = 0, size = 1024
-   %tensor = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
-   %b = tt.trans %tensor : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED_T>
-@@ -184,7 +184,7 @@ func @trans(%A : !tt.ptr<f16>) {
- }
- 
- // CHECK-LABEL: insert_slice_async
--func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
-+func.func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
-   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
-   %mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL>
-   %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
-@@ -197,7 +197,7 @@ func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
- }
- 
- // CHECK-LABEL: extract_slice
--func @extract_slice(%A : !tt.ptr<f16>) {
-+func.func @extract_slice(%A : !tt.ptr<f16>) {
-   // CHECK: offset = 0, size = 512
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED>
-   %index = arith.constant 0 : index
-@@ -209,7 +209,7 @@ func @extract_slice(%A : !tt.ptr<f16>) {
- // B0 -> (B1) -> B0
- // Memory used by B1 can be reused by B0.
- // CHECK-LABEL: if
--func @if(%i1 : i1) {
-+func.func @if(%i1 : i1) {
-   // CHECK: offset = 0, size = 512
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   // CHECK-NEXT: offset = 512, size = 512
-@@ -233,7 +233,7 @@ func @if(%i1 : i1) {
- // B0 -> (B1) -> (B2) -> B0
- // Memory used by B0 cannot be reused by B1 or B2.
- // CHECK-LABEL: if_else
--func @if_else(%i1 : i1) {
-+func.func @if_else(%i1 : i1) {
-   // CHECK: offset = 0, size = 512
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   // CHECK-NEXT: offset = 512, size = 512
-@@ -260,7 +260,7 @@ func @if_else(%i1 : i1) {
- // Block arguments and yields are memory aliases that do not trigger a new
- // allocation.
- // CHECK-LABEL: for
--func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-+func.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-   // CHECK: offset = 0, size = 8192
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   // CHECK-NEXT: offset = 8192, size = 8192
-@@ -275,7 +275,7 @@ func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.p
- }
- 
- // CHECK-LABEL: for_if_slice
--func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-+func.func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-   // CHECK: offset = 0, size = 8192
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   // CHECK-NEXT: offset = 8192, size = 8192
-@@ -296,7 +296,7 @@ func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %
- 
- // c0 cannot be released in the loop
- // CHECK-LABEL: for_use_ancestor
--func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-+func.func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-   // CHECK: offset = 0, size = 8192
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   // CHECK-NEXT: offset = 8192, size = 8192
-@@ -316,7 +316,7 @@ func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16
- // a_shared_init, b_shared_init, and c_shared_init's liveness ranges are span over the entire function before cst2.
- // So they cannot be reused by cst0 and cst1, but can be reused by cst2.
- // CHECK-LABEL: for_if_for
--func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-+func.func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
-   // CHECK: offset = 0, size = 8192
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   // CHECK-NEXT: offset = 8192, size = 8192
-diff --git a/test/Analysis/test-membar.mlir b/test/Analysis/test-membar.mlir
-index 7199e5f53d..17880b2094 100644
---- a/test/Analysis/test-membar.mlir
-+++ b/test/Analysis/test-membar.mlir
-@@ -14,7 +14,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- 
- // CHECK-LABEL: matmul_loop
- // There shouldn't be any membar with the dot op encoding.
--func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-+func.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-   %a_ptr_init = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
-   %b_ptr_init = tt.broadcast %B : (!tt.ptr<f16>) -> tensor<32x128x!tt.ptr<f16>, #BL>
- 
-@@ -42,7 +42,7 @@ func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B
- }
- 
- // CHECK-LABEL: raw_single_block
--func @raw_single_block(%A : !tt.ptr<f16>) {
-+func.func @raw_single_block(%A : !tt.ptr<f16>) {
-   %cst1 = arith.constant dense<true> : tensor<128x32xi1, #AL>
-   %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL>
-   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
-@@ -54,7 +54,7 @@ func @raw_single_block(%A : !tt.ptr<f16>) {
- }
- 
- // CHECK-LABEL: war_single_block
--func @war_single_block(%A : !tt.ptr<f16>) {
-+func.func @war_single_block(%A : !tt.ptr<f16>) {
-   %cst1 = arith.constant dense<true> : tensor<128x32xi1, #AL>
-   %cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL>
-   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
-@@ -70,7 +70,7 @@ func @war_single_block(%A : !tt.ptr<f16>) {
- }
- 
- // CHECK-LABEL: scratch
--func @scratch() {
-+func.func @scratch() {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   // CHECK: Membar 1
-   %a = tt.cat %cst0, %cst0 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
-@@ -81,7 +81,7 @@ func @scratch() {
- }
- 
- // CHECK-LABEL: async_wait
--func @async_wait() {
-+func.func @async_wait() {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   // CHECK: Membar 1
-   %a = tt.cat %cst0, %cst0 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
-@@ -92,7 +92,7 @@ func @async_wait() {
- }
- 
- // CHECK-LABEL: alloc
--func @alloc() {
-+func.func @alloc() {
-   %cst0 = triton_gpu.alloc_tensor : tensor<16x16xf16, #A_SHARED>
-   %a = tt.cat %cst0, %cst0 {axis = 0} : (tensor<16x16xf16, #A_SHARED>, tensor<16x16xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
-   // CHECK: Membar 2
-@@ -101,7 +101,7 @@ func @alloc() {
- }
- 
- // CHECK-LABEL: extract_slice
--func @extract_slice() {
-+func.func @extract_slice() {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<1x16x16xf16, #A_SHARED>
-   %index = arith.constant 0 : index
-   %cst1 = tensor.extract_slice %cst0[%index, 0, 0][1, 16, 16][1, 1, 1] : tensor<1x16x16xf16, #A_SHARED> to tensor<16x16xf16, #A_SHARED>
-@@ -113,14 +113,14 @@ func @extract_slice() {
- }
- 
- // CHECK-LABEL: trans
--func @trans() {
-+func.func @trans() {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
-   %b = tt.trans %cst0 : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED_T>
-   return
- }
- 
- // CHECK-LABEL: insert_slice_async
--func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
-+func.func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
-   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
-   %mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL>
-   %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
-@@ -135,7 +135,7 @@ func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
- }
- 
- // CHECK-LABEL: insert_slice
--func @insert_slice(%A : !tt.ptr<f16>, %i1 : i1) {
-+func.func @insert_slice(%A : !tt.ptr<f16>, %i1 : i1) {
-   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
-   %mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL>
-   %other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
-@@ -153,7 +153,7 @@ func @insert_slice(%A : !tt.ptr<f16>, %i1 : i1) {
- 
- // If branch inserted a barrier for %cst0 and %cst1, but else didn't, then the barrier should be inserted in the parent region
- // CHECK-LABEL: multi_blocks
--func @multi_blocks(%i1 : i1) {
-+func.func @multi_blocks(%i1 : i1) {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   scf.if %i1 {
-@@ -174,7 +174,7 @@ func @multi_blocks(%i1 : i1) {
- 
- // Both branches inserted a barrier for %cst0 and %cst1, then the barrier doesn't need to be inserted in the parent region
- // CHECK-LABEL: multi_blocks_join_barrier
--func @multi_blocks_join_barrier(%i1 : i1) {
-+func.func @multi_blocks_join_barrier(%i1 : i1) {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   scf.if %i1 {
-@@ -192,7 +192,7 @@ func @multi_blocks_join_barrier(%i1 : i1) {
- 
- // Read yielded tensor requires a barrier
- // CHECK-LABEL: multi_blocks_yield
--func @multi_blocks_yield(%i1 : i1) {
-+func.func @multi_blocks_yield(%i1 : i1) {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   %a = scf.if %i1 -> (tensor<32x16xf16, #A_SHARED>) {
-@@ -212,7 +212,7 @@ func @multi_blocks_yield(%i1 : i1) {
- 
- // Conservatively add a barrier as if the branch (%i1) is never taken
- // CHECK-LABEL: multi_blocks_noelse
--func @multi_blocks_noelse(%i1 : i1) {
-+func.func @multi_blocks_noelse(%i1 : i1) {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   scf.if %i1 {
-@@ -226,7 +226,7 @@ func @multi_blocks_noelse(%i1 : i1) {
- 
- // Conservatively add a barrier as if the branch (%i2) is never taken
- // CHECK-LABEL: multi_blocks_nested_scf
--func @multi_blocks_nested_scf(%i1 : i1, %i2 : i1) {
-+func.func @multi_blocks_nested_scf(%i1 : i1, %i2 : i1) {
-   %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   %cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A_SHARED>
-   scf.if %i1 {
-@@ -247,7 +247,7 @@ func @multi_blocks_nested_scf(%i1 : i1, %i2 : i1) {
- }
- 
- // CHECK-LABEL: for
--func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-+func.func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-@@ -262,7 +262,7 @@ func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.p
- // Although a_shared and b_shared are synced before entering the loop,
- // they are reassociated with aliases (c_shared) and thus require a barrier.
- // CHECK-LABEL: for_alias
--func @for_alias(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-+func.func @for_alias(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   // CHECK-NEXT: Membar 2
-@@ -282,7 +282,7 @@ func @for_alias(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B :
- // Although cst2 is not an argument of scf.yield, its memory is reused by cst1.
- // So we need a barrier both before and after cst1
- // CHECK-LABEL: for_reuse
--func @for_reuse(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-+func.func @for_reuse(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   // CHECK-NEXT: Membar 2
-@@ -302,7 +302,7 @@ func @for_reuse(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B :
- 
- 
- // CHECK-LABEL: for_reuse_nested
--func @for_reuse_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-+func.func @for_reuse_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-   %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
-   // CHECK-NEXT: Membar 2
-diff --git a/test/Conversion/triton_ops.mlir b/test/Conversion/triton_ops.mlir
-index e9ee502435..0e979b148d 100644
---- a/test/Conversion/triton_ops.mlir
-+++ b/test/Conversion/triton_ops.mlir
-@@ -1,6 +1,6 @@
- // RUN: triton-opt %s | FileCheck %s
- 
--func @cast_ops(%scalar_ptr: !tt.ptr<f32>, %scalar_f32: f32, %scalar_i64: i64) {
-+func.func @cast_ops(%scalar_ptr: !tt.ptr<f32>, %scalar_f32: f32, %scalar_i64: i64) {
-   // scalar -> scalar
-   // CHECK:  i64 -> !tt.ptr<f32>
-   %0 = tt.int_to_ptr %scalar_i64 : i64 -> !tt.ptr<f32>
-@@ -35,7 +35,7 @@ func @cast_ops(%scalar_ptr: !tt.ptr<f32>, %scalar_f32: f32, %scalar_i64: i64) {
-   return
- }
- 
--func @addptr_ops(%scalar_ptr: !tt.ptr<f32>, %scalar_i32: i32) {
-+func.func @addptr_ops(%scalar_ptr: !tt.ptr<f32>, %scalar_i32: i32) {
-   // scalar -> scalar
-   // CHECK: !tt.ptr<f32>
-   %0 = tt.addptr %scalar_ptr, %scalar_i32 : !tt.ptr<f32>, i32
-@@ -54,7 +54,7 @@ func @addptr_ops(%scalar_ptr: !tt.ptr<f32>, %scalar_i32: i32) {
-   return
- }
- 
--func @load_store_ops_scalar(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %mask : i1) {
-+func.func @load_store_ops_scalar(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %mask : i1) {
-   // Test if Load/Store ops can handle scalar values
-   %other = arith.constant 0.0e+0 : f32
- 
-@@ -76,7 +76,7 @@ func @load_store_ops_scalar(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %ma
-   return
- }
- 
--func @reduce_ops_infer(%ptr: !tt.ptr<f32>, %v : tensor<1x2x4xf32>) {
-+func.func @reduce_ops_infer(%ptr: !tt.ptr<f32>, %v : tensor<1x2x4xf32>) {
-   // Test if reduce ops infer types correctly
- 
-   // CHECK: %{{.*}} = tt.reduce %{{.*}} -> tensor<2x4xf32>
-@@ -101,7 +101,7 @@ func @reduce_ops_infer(%ptr: !tt.ptr<f32>, %v : tensor<1x2x4xf32>) {
-   return
- }
- 
--func @dot_ops_infer(%ptr: !tt.ptr<f32>, %v : f32) {
-+func.func @dot_ops_infer(%ptr: !tt.ptr<f32>, %v : f32) {
-   // Test if reduce ops infer types correctly
-   %v128x32 = tt.splat %v : (f32) -> tensor<128x32xf32>
-   %v32x128 = tt.splat %v : (f32) -> tensor<32x128xf32>
-diff --git a/test/Conversion/triton_to_tritongpu.mlir b/test/Conversion/triton_to_tritongpu.mlir
-index a160bc8815..b461ca542f 100644
---- a/test/Conversion/triton_to_tritongpu.mlir
-+++ b/test/Conversion/triton_to_tritongpu.mlir
-@@ -1,6 +1,6 @@
- // RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu=num-warps=2 | FileCheck %s
- 
--func @ops() {
-+func.func @ops() {
-   // CHECK: module attributes {"triton_gpu.num-warps" = 2 : i32} {{.*}}
-   %a = arith.constant dense<1.00e+00> : tensor<128x32xf16>
-   %b = arith.constant dense<2.00e+00> : tensor<32x128xf16>
-@@ -11,7 +11,7 @@ func @ops() {
- 
- // -----
- 
--func @load_ops(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
-+func.func @load_ops(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
-   // Test if LoadOp is lowered properly (see #771)
-   %ptrs = tt.splat %ptr : (!tt.ptr<f32>) -> tensor<128x!tt.ptr<f32>>
-   %mask = arith.constant dense<true> : tensor<128xi1>
-@@ -30,7 +30,7 @@ func @load_ops(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
- 
- // -----
- 
--func @reduce_ops(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
-+func.func @reduce_ops(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}) {
-   // Test if the total number of threadsPerWarp is 32
-   // Test if the total number of warps is 2
-   // CHECK: #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 2], order = [0, 1]}>
-diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
-index e9e7d5a340..507b362c99 100644
---- a/test/Conversion/tritongpu_to_llvm.mlir
-+++ b/test/Conversion/tritongpu_to_llvm.mlir
-@@ -4,7 +4,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK: llvm.func @test_empty_kernel(%arg0: i32, %arg1: !llvm.ptr<f16, 1>)
-   // Here the 128 comes from the 4 in module attribute multiples 32
-   // CHECK:  attributes {nvvm.kernel = 1 : ui1, nvvm.maxntid = 128 : i32} {{.*}}
--  func @test_empty_kernel(%lb : index, %A : !tt.ptr<f16>) {
-+  func.func @test_empty_kernel(%lb : index, %A : !tt.ptr<f16>) {
-     // CHECK:  llvm.return
-     return
-   }
-@@ -15,7 +15,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_load
--  func @basic_load(%a_ptr_init : tensor<256x!tt.ptr<f32>, #blocked0>, %cst : tensor<256xi1, #blocked0>, %cst_0 : tensor<256xf32, #blocked0>) {
-+  func.func @basic_load(%a_ptr_init : tensor<256x!tt.ptr<f32>, #blocked0>, %cst : tensor<256xi1, #blocked0>, %cst_0 : tensor<256xf32, #blocked0>) {
-     // CHECK: llvm.inline_asm
-     // CHECK: llvm.inline_asm
-     %1 = tt.load %a_ptr_init, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0>
-@@ -28,7 +28,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: vectorized_load
--  func @vectorized_load(%a_ptr_init : tensor<256x!tt.ptr<f32>, #blocked0>, %cst : tensor<256xi1, #blocked0>, %cst_0 : tensor<256xf32, #blocked0>) {
-+  func.func @vectorized_load(%a_ptr_init : tensor<256x!tt.ptr<f32>, #blocked0>, %cst : tensor<256xi1, #blocked0>, %cst_0 : tensor<256xf32, #blocked0>) {
-     // CHECK: llvm.inline_asm
-     // CHECK-SAME: ld.global.b32
-     // CHECK: llvm.inline_asm
-@@ -43,7 +43,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK-LABEL: vectorized_load_f16
--  func @vectorized_load_f16(%a_ptr_init: tensor<256x!tt.ptr<f16>, #blocked0>, %cst : tensor<256xi1, #blocked0>, %cst_0 : tensor<256xf16, #blocked0>) {
-+  func.func @vectorized_load_f16(%a_ptr_init: tensor<256x!tt.ptr<f16>, #blocked0>, %cst : tensor<256xi1, #blocked0>, %cst_0 : tensor<256xf16, #blocked0>) {
-     // CHECK: llvm.inline_asm
-     // CHECK-SAME: ld.global.b16
-     // CHECK: llvm.inline_asm
-@@ -59,7 +59,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: masked_load_const_other
--  func @masked_load_const_other(%a_ptr_init : tensor<256x!tt.ptr<f32>, #blocked0>, %cst : tensor<256xi1, #blocked0>) {
-+  func.func @masked_load_const_other(%a_ptr_init : tensor<256x!tt.ptr<f32>, #blocked0>, %cst : tensor<256xi1, #blocked0>) {
-     %cst_0 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked0>
-     %1 = tt.load %a_ptr_init, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0>
-     return
-@@ -72,7 +72,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: masked_load_const_other_vec
--  func @masked_load_const_other_vec(%a_ptr_init : tensor<256x!tt.ptr<f32>, #blocked0>, %cst : tensor<256xi1, #blocked0>) {
-+  func.func @masked_load_const_other_vec(%a_ptr_init : tensor<256x!tt.ptr<f32>, #blocked0>, %cst : tensor<256xi1, #blocked0>) {
-     %cst_0 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked0>
-     %1 = tt.load %a_ptr_init, %cst, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked0>
-     return
-@@ -84,7 +84,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 2 : i32} {
-   // CHECK-LABEL: global_load_store_no_vec
--  func @global_load_store_no_vec(%arg0: !tt.ptr<f32> {tt.divisibility = 4 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 4 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 4 : i32}, %arg3: i32) {
-+  func.func @global_load_store_no_vec(%arg0: !tt.ptr<f32> {tt.divisibility = 4 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 4 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 4 : i32}, %arg3: i32) {
-     %c256_i32 = arith.constant 256 : i32
-     %0 = tt.get_program_id {axis = 0 : i32} : i32
-     %1 = arith.muli %0, %c256_i32 : i32
-@@ -128,7 +128,7 @@ module attributes {"triton_gpu.num-warps" = 2 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 2 : i32} {
-   // CHECK-LABEL: global_load_store_vec4
--  func @global_load_store_vec4(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32) {
-+  func.func @global_load_store_vec4(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32) {
-     %c256_i32 = arith.constant 256 : i32
-     %0 = tt.get_program_id {axis = 0 : i32} : i32
-     %1 = arith.muli %0, %c256_i32 : i32
-@@ -165,7 +165,7 @@ module attributes {"triton_gpu.num-warps" = 2 : i32} {
- #blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0]}>
- // Note, the %n_elements doesn't have a "tt.divisibility" hint, so Triton assumes it's divisibility is 1, this should effect the mask's alignment and further restrict the load/store ops' vector width to be 1.
- module attributes {"triton_gpu.num-warps" = 2 : i32} {
--  func @vecadd_masked_vec1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n_elements: i32) {
-+  func.func @vecadd_masked_vec1(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %n_elements: i32) {
-     %c64_i32 = arith.constant 64 : i32
-     %0 = tt.get_program_id {axis = 0 : i32} : i32
-     %1 = arith.muli %0, %c64_i32 : i32
-@@ -195,7 +195,7 @@ module attributes {"triton_gpu.num-warps" = 2 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK-LABEL: global_load_store_vec2
--    func @global_load_store_vec2(%arg0: !tt.ptr<f32> {tt.divisibility = 8 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 8 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 8 : i32}, %arg3: i32) {
-+    func.func @global_load_store_vec2(%arg0: !tt.ptr<f32> {tt.divisibility = 8 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 8 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 8 : i32}, %arg3: i32) {
-     %c256_i32 = arith.constant 256 : i32
-     %0 = tt.get_program_id {axis = 0 : i32} : i32
-     %1 = arith.muli %0, %c256_i32 : i32
-@@ -240,7 +240,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK-LABEL: global_load_store_vec8
--    func @global_load_store_vec8(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32) {
-+    func.func @global_load_store_vec8(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32) {
-     %c256_i32 = arith.constant 256 : i32
-     %0 = tt.get_program_id {axis = 0 : i32} : i32
-     %1 = arith.muli %0, %c256_i32 : i32
-@@ -283,7 +283,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_view_broadcast
--  func @basic_view_broadcast(%arg : tensor<256xf32,#blocked0>) {
-+  func.func @basic_view_broadcast(%arg : tensor<256xf32,#blocked0>) {
-     // CHECK: llvm.mlir.undef
-     // CHECK: %[[T0:.*]] = llvm.extractvalue
-     // CHECK: %[[T1:.*]] = llvm.extractvalue
-@@ -307,7 +307,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_make_range
--  func @basic_make_range() {
-+  func.func @basic_make_range() {
-     // CHECK: nvvm.read.ptx.sreg.tid.x
-     // CHECK: llvm.mlir.undef
-     // CHECK: llvm.insertvalue
-@@ -322,7 +322,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_addf
--  func @basic_addf(%arg0 : tensor<256xf32,#blocked0>, %arg1 : tensor<256xf32,#blocked0>) {
-+  func.func @basic_addf(%arg0 : tensor<256xf32,#blocked0>, %arg1 : tensor<256xf32,#blocked0>) {
-     // CHECK: llvm.fadd
-     // CHECK: llvm.fadd
-     %1 = arith.addf %arg0, %arg1 : tensor<256xf32,#blocked0>
-@@ -335,7 +335,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_addi
--  func @basic_addi(%arg0 : tensor<256xi32,#blocked0>, %arg1 : tensor<256xi32,#blocked0>) {
-+  func.func @basic_addi(%arg0 : tensor<256xi32,#blocked0>, %arg1 : tensor<256xi32,#blocked0>) {
-     // CHECK: llvm.add
-     // CHECK: llvm.add
-     %1 = arith.addi %arg0, %arg1 : tensor<256xi32,#blocked0>
-@@ -347,7 +347,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- 
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_program_id
--  func @basic_program_id() {
-+  func.func @basic_program_id() {
-     // CHECK: nvvm.read.ptx.sreg.ctaid.x : i32
-     %0 = tt.get_program_id {axis = 0 : i32} : i32
-     return
-@@ -359,7 +359,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_addptr
--  func @basic_addptr(%arg0 : tensor<256x!tt.ptr<f32>,#blocked0>, %arg1 : tensor<256xi32,#blocked0>) {
-+  func.func @basic_addptr(%arg0 : tensor<256x!tt.ptr<f32>,#blocked0>, %arg1 : tensor<256xi32,#blocked0>) {
-     // CHECK: llvm.getelementptr
-     // CHECK: llvm.getelementptr
-     %0 = tt.addptr %arg0, %arg1 : tensor<256x!tt.ptr<f32>, #blocked0>, tensor<256xi32, #blocked0>
-@@ -373,7 +373,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK: llvm.mlir.global external @global_smem
-   // CHECK-LABEL: basic_alloc_tensor
--  func @basic_alloc_tensor() {
-+  func.func @basic_alloc_tensor() {
-     // CHECK: llvm.mlir.addressof @global_smem
-     // CHECK-NEXT: llvm.bitcast
-     // CHECK-NEXT: llvm.mlir.constant
-@@ -390,7 +390,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK: llvm.mlir.global external @global_smem
-   // CHECK-LABEL: basic_extract_slice
--  func @basic_extract_slice() {
-+  func.func @basic_extract_slice() {
-     // CHECK: llvm.mlir.addressof @global_smem
-     // CHECK: llvm.extractvalue
-     // CHECK-NEXT: llvm.extractvalue
-@@ -423,7 +423,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- 
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_async_wait
--  func @basic_async_wait() {
-+  func.func @basic_async_wait() {
-     // CHECK: cp.async.wait_group 0x4
-     triton_gpu.async_wait {num = 4: i32}
-     return
-@@ -442,7 +442,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #A = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_insert_slice_async_fallback
--  func @basic_insert_slice_async_fallback(%arg0: !tt.ptr<f16> {tt.divisibility = 1 : i32}) {
-+  func.func @basic_insert_slice_async_fallback(%arg0: !tt.ptr<f16> {tt.divisibility = 1 : i32}) {
-     %off0_ = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #slice2d1>
-     %off1_ = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<64xi32, #slice3d0>
-     %off0 = tt.expand_dims %off0_ {axis = 1 : i32} : (tensor<16xi32, #slice2d1>) -> tensor<16x1xi32, #block2>
-@@ -481,7 +481,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #A = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_insert_slice_async_v4
--  func @basic_insert_slice_async_v4(%arg0: !tt.ptr<f32> {tt.divisibility = 32 : i32}) {
-+  func.func @basic_insert_slice_async_v4(%arg0: !tt.ptr<f32> {tt.divisibility = 32 : i32}) {
-     %off0_ = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #slice2d1>
-     %off1_ = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<64xi32, #slice3d0>
-     %off0 = tt.expand_dims %off0_ {axis = 1 : i32} : (tensor<16xi32, #slice2d1>) -> tensor<16x1xi32, #block2>
-@@ -523,7 +523,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #A = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 4, order = [1, 0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_insert_slice_async_v1
--  func @basic_insert_slice_async_v1(%arg0: !tt.ptr<f32> {tt.divisibility = 4 : i32}) {
-+  func.func @basic_insert_slice_async_v1(%arg0: !tt.ptr<f32> {tt.divisibility = 4 : i32}) {
-     %off0_ = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #slice2d1>
-     %off1_ = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #slice3d0>
-     %off0 = tt.expand_dims %off0_ {axis = 1 : i32} : (tensor<16xi32, #slice2d1>) -> tensor<16x1xi32, #block2>
-@@ -568,7 +568,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #A = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 4, order = [1, 0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_insert_slice_async_v1_multictas
--  func @basic_insert_slice_async_v1_multictas(%arg0: !tt.ptr<f32> {tt.divisibility = 4 : i32}) {
-+  func.func @basic_insert_slice_async_v1_multictas(%arg0: !tt.ptr<f32> {tt.divisibility = 4 : i32}) {
-     %off0_ = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #slice2d1>
-     %off1_ = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #slice3d0>
-     %off0 = tt.expand_dims %off0_ {axis = 1 : i32} : (tensor<32xi32, #slice2d1>) -> tensor<32x1xi32, #block2>
-@@ -619,7 +619,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK: basic_splat
--  func @basic_splat(%ptr: !tt.ptr<f32>) {
-+  func.func @basic_splat(%ptr: !tt.ptr<f32>) {
-     // CHECK: llvm.mlir.undef
-     // CHECK: llvm.insertvalue
-     // CHECK: llvm.insertvalue
-@@ -633,7 +633,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: basic_store
--  func @basic_store(%ptrs: tensor<256x!tt.ptr<f32>, #blocked0>, %vals: tensor<256xf32, #blocked0>, %mask: tensor<256xi1, #blocked0>) {
-+  func.func @basic_store(%ptrs: tensor<256x!tt.ptr<f32>, #blocked0>, %vals: tensor<256xf32, #blocked0>, %mask: tensor<256xi1, #blocked0>) {
-     // CHECK: llvm.inline_asm
-     // CHECK-SAME: st.global.b32 [ ${{.*}} + 0 ], { ${{.*}} };
-     // CHECK: llvm.inline_asm
-@@ -650,7 +650,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8>
-   // CHECK-LABEL: convert_layout_blocked_blocked
--  func @convert_layout_blocked_blocked(%arg0: tensor<16x16xf32, #blocked0>) {
-+  func.func @convert_layout_blocked_blocked(%arg0: tensor<16x16xf32, #blocked0>) {
-     // CHECK: llvm.mlir.addressof @global_smem
-     // CHECK: llvm.store
-     // CHECK-SAME: !llvm.ptr<vector<1xf32>, 3>
-@@ -697,7 +697,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8>
-   // CHECK-LABEL: convert_layout_blocked_blocked_vec
--  func @convert_layout_blocked_blocked_vec(%arg0: tensor<16x16xf32, #blocked0>) {
-+  func.func @convert_layout_blocked_blocked_vec(%arg0: tensor<16x16xf32, #blocked0>) {
-     // CHECK: llvm.mlir.addressof @global_smem
-     // CHECK: llvm.store
-     // CHECK-SAME: !llvm.ptr<vector<4xf32>, 3>
-@@ -720,7 +720,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8>
-   // CHECK-LABEL: convert_layout_blocked_blocked_multi_rep
--  func @convert_layout_blocked_blocked_multi_rep(%arg0: tensor<16x16xf32, #blocked0>) {
-+  func.func @convert_layout_blocked_blocked_multi_rep(%arg0: tensor<16x16xf32, #blocked0>) {
-     // CHECK: llvm.mlir.addressof @global_smem
-     // CHECK: llvm.store
-     // CHECK-SAME: !llvm.ptr<vector<4xf32>, 3>
-@@ -751,7 +751,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma0}>
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK-LABEL: convert_dot
--  func @convert_dot(%A: tensor<16x16xf16, #blocked0>, %B: tensor<16x16xf16, #blocked0>) {
-+  func.func @convert_dot(%A: tensor<16x16xf16, #blocked0>, %B: tensor<16x16xf16, #blocked0>) {
-     %AA = triton_gpu.convert_layout %A : (tensor<16x16xf16, #blocked0>) -> tensor<16x16xf16, #shared0>
-     %BB = triton_gpu.convert_layout %B : (tensor<16x16xf16, #blocked0>) -> tensor<16x16xf16, #shared0>
-     // CHECK: llvm.inline_asm
-@@ -775,7 +775,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- // TODO: problems in MLIR's parser on slice layout
- // #blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0]}>
- // module attributes {"triton_gpu.num-warps" = 1 : i32} {
--//   func @make_range_sliced_layout() {
-+//   func.func @make_range_sliced_layout() {
- //     %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked0}>>
- //     return
- //   }
-@@ -788,7 +788,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8>
-   // CHECK-LABEL: convert_layout_mmav2_block
--  func @convert_layout_mmav2_blocked(%arg0: tensor<32x16xf32, #mma>) {
-+  func.func @convert_layout_mmav2_blocked(%arg0: tensor<32x16xf32, #mma>) {
-     // CHECK: llvm.store
-     // CHECK-SAME: !llvm.ptr<vector<2xf32>, 3>
-     // CHECK: llvm.store
-@@ -808,7 +808,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8>
-   // CHECK-LABEL: convert_layout_mmav1_block
--  func @convert_layout_mmav1_blocked(%arg0: tensor<32x64xf32, #mma>) {
-+  func.func @convert_layout_mmav1_blocked(%arg0: tensor<32x64xf32, #mma>) {
-     // CHECK: llvm.store
-     // CHECK-SAME: !llvm.ptr<vector<2xf32>, 3>
-     // CHECK: llvm.store
-@@ -831,7 +831,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8>
-   // CHECK-LABEL: convert_layout_blocked_shared
--  func @convert_layout_blocked_shared(%arg0: tensor<128x32xf32, #blocked0>) {
-+  func.func @convert_layout_blocked_shared(%arg0: tensor<128x32xf32, #blocked0>) {
-     // CHECK: llvm.store
-     // CHECK-SAME: !llvm.ptr<vector<8xf32>, 3>
-     // CHECK: llvm.store
-@@ -847,7 +847,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK-LABEL: convert_blocked1d_to_slice0
--  func @convert_blocked1d_to_slice0(%src:tensor<32xi32, #blocked0>) {
-+  func.func @convert_blocked1d_to_slice0(%src:tensor<32xi32, #blocked0>) {
-     // CHECK-COUNT-4: llvm.load {{.*}} : !llvm.ptr<vector<1xi32>, 3>
-     %cvt = triton_gpu.convert_layout %src : (tensor<32xi32, #blocked0>) -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
-     return
-@@ -860,7 +860,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [1, 1], order = [1, 0]}>
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK-LABEL: convert_blocked1d_to_slice1
--  func @convert_blocked1d_to_slice1(%src:tensor<32xi32, #blocked0>) {
-+  func.func @convert_blocked1d_to_slice1(%src:tensor<32xi32, #blocked0>) {
-     // CHECK-COUNT-32: llvm.load {{.*}} : !llvm.ptr<vector<1xi32>, 3>
-     %cvt = triton_gpu.convert_layout %src : (tensor<32xi32, #blocked0>) -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
-     return
-@@ -873,7 +873,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- #blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK-LABEL: convert_blocked_to_blocked_ptr
--  func @convert_blocked_to_blocked_ptr(%src:tensor<32x!tt.ptr<f32>, #blocked0>) {
-+  func.func @convert_blocked_to_blocked_ptr(%src:tensor<32x!tt.ptr<f32>, #blocked0>) {
-     // CHECK: llvm.ptrtoint
-     // CHECK: llvm.store
-     // CHECK: nvvm.barrier0
-@@ -892,7 +892,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- #dot_operand_a = #triton_gpu.dot_op<{opIdx=0, parent=#mma}>
- #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
--  func @matmul_kernel_dot_operand_layout(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-+  func.func @matmul_kernel_dot_operand_layout(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-   %a:tensor<128x32xf16, #shared>, %b:tensor<32x256xf16, #shared>) {
-     %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma>
-     // CHECK: ldmatrix.sync.aligned.m8n8.x4.shared.b16
-@@ -918,7 +918,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #dot_operand_a = #triton_gpu.dot_op<{opIdx=0, parent=#mma, isMMAv1Row=true}>
- #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma, isMMAv1Row=true}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
--  func @matmul884_kernel_dot_operand_layout(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-+  func.func @matmul884_kernel_dot_operand_layout(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-   %a:tensor<32x64xf16, #shared0>, %b:tensor<64x64xf16, #shared1>) {
-     %cst = arith.constant dense<0.000000e+00> : tensor<32x64xf32, #mma>
-     // CHECK: ldmatrix.sync.aligned.m8n8.x4.shared.b16
-@@ -941,7 +941,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #dot_operand_a = #triton_gpu.dot_op<{opIdx=0, parent=#blocked}>
- #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#blocked}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
--  func @matmul_fmadot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-+  func.func @matmul_fmadot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-   %a:tensor<32x16xf32, #shared>, %b:tensor<16x32xf32, #shared>) {
-     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked>
-     // CHECK: llvm.intr.fmuladd
-@@ -965,7 +965,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #dot_operand_b = #triton_gpu.dot_op<{opIdx=1, parent=#mma}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: matmul_tf32dot
--  func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-+  func.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-   %a:tensor<32x16xf32, #shared>, %b:tensor<16x32xf32, #shared>) {
-     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
-     // CHECK: llvm.inline_asm
-@@ -1000,7 +1000,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: atomic_add_f32
--  func @atomic_add_f32(%arg0 : tensor<256x!tt.ptr<f32>, #blocked0>, %arg1 : tensor<256xi1, #blocked0>, %arg2 : tensor<256xf32, #blocked0>) {
-+  func.func @atomic_add_f32(%arg0 : tensor<256x!tt.ptr<f32>, #blocked0>, %arg1 : tensor<256xi1, #blocked0>, %arg2 : tensor<256xf32, #blocked0>) {
-     // CHECK: llvm.inline_asm
-     // CHECK-SAME: atom.global.gpu.add.f32
-     %0 = "tt.atomic_rmw" (%arg0, %arg2, %arg1) {atomic_rmw_op = 5 : i32} : (tensor<256x!tt.ptr<f32>, #blocked0>, tensor<256xf32, #blocked0>, tensor<256xi1, #blocked0>) -> tensor<256xf32, #blocked0>
-@@ -1012,7 +1012,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
- 
--func @test_get_program_id(%a: tensor<32x!tt.ptr<i32>, #blocked0>) {
-+func.func @test_get_program_id(%a: tensor<32x!tt.ptr<i32>, #blocked0>) {
-   %blockidx = tt.get_program_id {axis=0:i32} : i32
-   %blockidy = tt.get_program_id {axis=1:i32} : i32
-   %blockidz = tt.get_program_id {axis=2:i32} : i32
-@@ -1032,7 +1032,7 @@ func @test_get_program_id(%a: tensor<32x!tt.ptr<i32>, #blocked0>) {
- // -----
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
--  func @test_get_num_program(%a: tensor<32x!tt.ptr<i32>, #blocked0>) {
-+  func.func @test_get_num_program(%a: tensor<32x!tt.ptr<i32>, #blocked0>) {
-     // CHECK: nvvm.read.ptx.sreg.nctaid.x
-     // CHECK: nvvm.read.ptx.sreg.nctaid.y
-     // CHECK: nvvm.read.ptx.sreg.nctaid.z
-@@ -1052,7 +1052,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #blocked0 = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
-   // CHECK-LABEL: test_index_cache
--  func @test_index_cache() {
-+  func.func @test_index_cache() {
-     // CHECK: nvvm.read.ptx.sreg.tid.x
-     %0 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked0>
-     // CHECK-NOT: nvvm.read.ptx.sreg.tid.x
-@@ -1066,7 +1066,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- #shared0 = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0]}>
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK-LABEL: test_base_index_cache
--  func @test_base_index_cache(%arg0: tensor<128x32xf32, #blocked0>) {
-+  func.func @test_base_index_cache(%arg0: tensor<128x32xf32, #blocked0>) {
-     // CHECK: nvvm.read.ptx.sreg.tid.x
-     %0 = triton_gpu.convert_layout %arg0 : (tensor<128x32xf32, #blocked0>) -> tensor<128x32xf32, #shared0>
-     // CHECK-NOT: nvvm.read.ptx.sreg.tid.x
-@@ -1080,7 +1080,7 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
- #shared0 = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0]}>
- module attributes {"triton_gpu.num-warps" = 1 : i32} {
-   // CHECK-LABEL: test_index_cache_different_block
--  func @test_index_cache_different_block(%arg0: tensor<128x32xf32, #blocked0>, %arg1: i1) {
-+  func.func @test_index_cache_different_block(%arg0: tensor<128x32xf32, #blocked0>, %arg1: i1) {
-     // CHECK: nvvm.read.ptx.sreg.tid.x
-     %0 = triton_gpu.convert_layout %arg0 : (tensor<128x32xf32, #blocked0>) -> tensor<128x32xf32, #shared0>
-     scf.if %arg1 {
-diff --git a/test/Target/tritongpu_to_llvmir.mlir b/test/Target/tritongpu_to_llvmir.mlir
-index cafff3ca60..114d3a9eb2 100644
---- a/test/Target/tritongpu_to_llvmir.mlir
-+++ b/test/Target/tritongpu_to_llvmir.mlir
-@@ -4,11 +4,11 @@
- // CHECK-LABEL: ; ModuleID = 'LLVMDialectModule'
- // CHECK: define void @test_empty_kernel
- // CHECK: !nvvm.annotations
--// CHECK: !{void (i32, half addrspace(1)*)* @test_empty_kernel, !"maxntidx", i32 128}
-+// CHECK: !{ptr @test_empty_kernel, !"maxntidx", i32 128}
- 
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
- 
--func @test_empty_kernel(%lb : index, %A : !tt.ptr<f16>) {
-+func.func @test_empty_kernel(%lb : index, %A : !tt.ptr<f16>) {
- 
-   return
- }
-diff --git a/test/Target/tritongpu_to_ptx.mlir b/test/Target/tritongpu_to_ptx.mlir
-index 404e970a29..12742ad9e2 100644
---- a/test/Target/tritongpu_to_ptx.mlir
-+++ b/test/Target/tritongpu_to_ptx.mlir
-@@ -6,7 +6,7 @@
- 
- module attributes {"triton_gpu.num-warps" = 4 : i32} {
- 
--func @test_empty_kernel(%lb : index, %A : !tt.ptr<f16>) {
-+func.func @test_empty_kernel(%lb : index, %A : !tt.ptr<f16>) {
- 
-   return
- }
-diff --git a/test/Triton/combine.mlir b/test/Triton/combine.mlir
-index 050a3f7565..5ef6790e69 100644
---- a/test/Triton/combine.mlir
-+++ b/test/Triton/combine.mlir
-@@ -2,10 +2,10 @@
- // RUN: triton-opt %s -split-input-file -canonicalize -triton-combine | FileCheck %s
- 
- // CHECK-LABEL: @test_combine_dot_add_pattern
--func @test_combine_dot_add_pattern() -> (tensor<128x128xf32>, tensor<128x128xf32>) {
--    // CHECK: %[[d:.*]] = arith.constant dense<3.000000e+00> : tensor<128x128xf32>
--    // CHECK: %[[b:.*]] = arith.constant dense<2.000000e+00> : tensor<128x128xf32>
--    // CHECK: %[[a:.*]] = arith.constant dense<1.000000e+00> : tensor<128x128xf32>
-+func.func @test_combine_dot_add_pattern() -> (tensor<128x128xf32>, tensor<128x128xf32>) {
-+    // CHECK-DAG: %[[d:.*]] = arith.constant dense<3.000000e+00> : tensor<128x128xf32>
-+    // CHECK-DAG: %[[b:.*]] = arith.constant dense<2.000000e+00> : tensor<128x128xf32>
-+    // CHECK-DAG: %[[a:.*]] = arith.constant dense<1.000000e+00> : tensor<128x128xf32>
-     %a = arith.constant dense<1.0> : tensor<128x128xf32>
-     %b = arith.constant dense<2.0> : tensor<128x128xf32>
-     %zero = arith.constant dense<0.0> : tensor<128x128xf32>
-@@ -24,7 +24,7 @@ func @test_combine_dot_add_pattern() -> (tensor<128x128xf32>, tensor<128x128xf32
- 
- 
- // COM: CHECK-LABEL: @test_combine_addptr_pattern
--func @test_combine_addptr_pattern(%base: !tt.ptr<f32>) -> tensor<8x!tt.ptr<f32>> {
-+func.func @test_combine_addptr_pattern(%base: !tt.ptr<f32>) -> tensor<8x!tt.ptr<f32>> {
-     %off0 = arith.constant 10 : i32
-     %off1 = arith.constant 15 : i32
- 
-@@ -47,46 +47,46 @@ func @test_combine_addptr_pattern(%base: !tt.ptr<f32>) -> tensor<8x!tt.ptr<f32>>
- 
- 
- // CHECK-LABEL: @test_combine_select_masked_load_pattern
--func @test_combine_select_masked_load_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %cond: i1) -> (tensor<8xf32>, tensor<8xf32>) {
-+func.func @test_combine_select_masked_load_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %cond: i1) -> (tensor<8xf32>, tensor<8xf32>) {
-     %mask = tt.broadcast %cond : (i1) -> tensor<8xi1>
-     %false_val = arith.constant dense<0.0> : tensor<8xf32>
- 
-     // CHECK: %[[res1:.*]] = tt.load %{{.*}}, %{{.*}}, %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32>
-     %x = tt.load %ptr, %mask, %false_val {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32>
--    %0 = select %cond, %x, %false_val : tensor<8xf32>
-+    %0 = arith.select %cond, %x, %false_val : tensor<8xf32>
- 
-     // CHECK: %[[res2:.*]] = tt.load %{{.*}}, %{{.*}}, %{{.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32>
-     %y = tt.load %ptr, %mask, %false_val {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32>
--    %1 = select %cond, %y, %false_val : tensor<8xf32>
-+    %1 = arith.select %cond, %y, %false_val : tensor<8xf32>
- 
-     // CHECK: return %[[res1]], %[[res2]] : tensor<8xf32>, tensor<8xf32>
-     return %0, %1 : tensor<8xf32>, tensor<8xf32>
- }
- 
- // CHECK-LABEL: @test_combine_select_masked_load_fail_pattern
--func @test_combine_select_masked_load_fail_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %dummy_load: tensor<8xf32>, %dummy_broadcast: tensor<8xi1>, %cond0: i1, %cond1: i1) -> (tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) {
-+func.func @test_combine_select_masked_load_fail_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %dummy_load: tensor<8xf32>, %dummy_broadcast: tensor<8xi1>, %cond0: i1, %cond1: i1) -> (tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) {
-     %false_val = arith.constant dense<0.0> : tensor<8xf32>
- 
-     // Case 1: value at the "load" position is not an "op".  Select should not be canonicalized.
--    // CHECK: %{{.*}} = select %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32>
--    %0 = select %cond0, %dummy_load, %false_val : tensor<8xf32>
-+    // CHECK: %{{.*}} = arith.select %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32>
-+    %0 = arith.select %cond0, %dummy_load, %false_val : tensor<8xf32>
- 
-     // Case 2: value at the "broadcast" position is not an "op".  Select should not be canonicalized.
-     %real_load0 = tt.load %ptr, %dummy_broadcast, %false_val {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32>
--    // CHECK: %{{.*}} = select %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32>
--    %1 = select %cond0, %real_load0, %false_val : tensor<8xf32>
-+    // CHECK: %{{.*}} = arith.select %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32>
-+    %1 = arith.select %cond0, %real_load0, %false_val : tensor<8xf32>
- 
-     // Case 3: condition of "broadcast" is not the same as the condition of "select".  Select should not be canonicalized.
-     %cond0_ = tt.broadcast %cond0 : (i1) -> tensor<8xi1>
-     %real_load1 = tt.load %ptr, %cond0_, %false_val {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<8xf32>
--    // CHECK: %{{.*}} = select %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32>
--    %2 = select %cond1, %real_load1, %false_val : tensor<8xf32>
-+    // CHECK: %{{.*}} = arith.select %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32>
-+    %2 = arith.select %cond1, %real_load1, %false_val : tensor<8xf32>
- 
-     return %0, %1, %2 : tensor<8xf32>, tensor<8xf32>, tensor<8xf32>
- }
- 
- // CHECK-LABEL: @test_combine_broadcast_constant_pattern
--func @test_combine_broadcast_constant_pattern(%cst : f32) -> tensor<8x2xf32> {
-+func.func @test_combine_broadcast_constant_pattern(%cst : f32) -> tensor<8x2xf32> {
-     // CHECK: %[[cst:.*]] = arith.constant dense<1.000000e+00> : tensor<8x2xf32>
-     %const = arith.constant dense<1.0> : tensor<8xf32>
-     %bst_out = tt.broadcast %const : (tensor<8xf32>) -> tensor<8x2xf32>
-@@ -96,7 +96,7 @@ func @test_combine_broadcast_constant_pattern(%cst : f32) -> tensor<8x2xf32> {
- }
- 
- // CHECK-LABEL: @test_canonicalize_masked_load_pattern
--func @test_canonicalize_masked_load_pattern(%ptr: tensor<8x!tt.ptr<f32>>) -> (tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) {
-+func.func @test_canonicalize_masked_load_pattern(%ptr: tensor<8x!tt.ptr<f32>>) -> (tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) {
-     %true_mask = arith.constant dense<true> : tensor<8xi1>
-     %false_mask = arith.constant dense<false> : tensor<8xi1>
-     %other_val = arith.constant dense<0.0> : tensor<8xf32>
-@@ -117,7 +117,7 @@ func @test_canonicalize_masked_load_pattern(%ptr: tensor<8x!tt.ptr<f32>>) -> (te
- }
- 
- // CHECK-LABEL: @test_canonicalize_masked_load_fail_pattern
--func @test_canonicalize_masked_load_fail_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %mask: tensor<8xi1>) -> (tensor<8xf32>, tensor<8xf32>) {
-+func.func @test_canonicalize_masked_load_fail_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %mask: tensor<8xi1>) -> (tensor<8xf32>, tensor<8xf32>) {
-     %other_val = arith.constant dense<0.0> : tensor<8xf32>
- 
-     // Case: value at the "mask" position is not an "op".  Load should not be canonicalized.
-@@ -130,7 +130,7 @@ func @test_canonicalize_masked_load_fail_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %
- }
- 
- // CHECK-LABEL: @test_canonicalize_masked_store_pattern
--func @test_canonicalize_masked_store_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %val: tensor<8xf32>) {
-+func.func @test_canonicalize_masked_store_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %val: tensor<8xf32>) {
-     %true_mask = arith.constant dense<true> : tensor<8xi1>
-     %false_mask = arith.constant dense<false> : tensor<8xi1>
- 
-@@ -144,7 +144,7 @@ func @test_canonicalize_masked_store_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %val:
- }
- 
- // CHECK-LABEL: @test_canonicalize_masked_store_fail_pattern
--func @test_canonicalize_masked_store_fail_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %val: tensor<8xf32>, %mask: tensor<8xi1>) {
-+func.func @test_canonicalize_masked_store_fail_pattern(%ptr: tensor<8x!tt.ptr<f32>>, %val: tensor<8xf32>, %mask: tensor<8xi1>) {
-     // Case: value at the "mask" position is not an "op".  Store should not be canonicalized.
-     // CHECK: tt.store %{{.*}}, %{{.*}}, %{{.*}} : tensor<8xf32>
-     tt.store %ptr, %val, %mask : tensor<8xf32>
-diff --git a/test/Triton/vecadd.mlir b/test/Triton/vecadd.mlir
-index 0b69ef3054..f5019b1cdd 100644
---- a/test/Triton/vecadd.mlir
-+++ b/test/Triton/vecadd.mlir
-@@ -1,7 +1,7 @@
- // RUN: triton-opt %s -verify-diagnostics
- 
- module {
--  func @add_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32__(%arg0: !tt.ptr<f32>, %arg1: !tt.ptr<f32>, %arg2: !tt.ptr<f32>, %arg3: i32, %arg4: i32, %arg5: i32) {
-+  func.func @add_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32__(%arg0: !tt.ptr<f32>, %arg1: !tt.ptr<f32>, %arg2: !tt.ptr<f32>, %arg3: i32, %arg4: i32, %arg5: i32) {
-     %0 = tt.get_program_id {axis = 0 : i32} : i32
-     %c256_i32 = arith.constant 256 : i32
-     %1 = arith.muli %0, %c256_i32 : i32
-@@ -43,7 +43,7 @@ module {
-   }
- }
- // module {
--//   func @add_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32__(%arg0: !tt.ptr<f32>, %arg1: !tt.ptr<f32>, %arg2: !tt.ptr<f32>, %arg3: i32, %arg4: i32, %arg5: i32) {
-+//   func.func @add_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32__(%arg0: !tt.ptr<f32>, %arg1: !tt.ptr<f32>, %arg2: !tt.ptr<f32>, %arg3: i32, %arg4: i32, %arg5: i32) {
- //     %c64 = arith.constant 64 : index
- //     %c32 = arith.constant 32 : index
- //     %c0 = arith.constant 0 : index
-diff --git a/test/TritonGPU/coalesce.mlir b/test/TritonGPU/coalesce.mlir
-index 60e359f527..51cccccfbd 100644
---- a/test/TritonGPU/coalesce.mlir
-+++ b/test/TritonGPU/coalesce.mlir
-@@ -19,7 +19,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32} {
- // CHECK: [[store_val:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xf32, [[col_layout]]>
- // CHECK: [[store_mask:%.*]] = triton_gpu.convert_layout {{.*}} -> tensor<64x64xi1, [[col_layout]]>
- // CHECK: tt.store [[store_ptr]], [[store_val]], [[store_mask]]
--func @transpose(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32},
-+func.func @transpose(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32},
-                 %arg1: i32 {tt.divisibility = 16 : i32},
-                 %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32},
-                 %arg3: i32 {tt.divisibility = 16 : i32}) {
-diff --git a/test/TritonGPU/combine.mlir b/test/TritonGPU/combine.mlir
-index 2c009ffa48..7e9cb9d504 100644
---- a/test/TritonGPU/combine.mlir
-+++ b/test/TritonGPU/combine.mlir
-@@ -9,7 +9,7 @@
- // CHECK: [[col_layout:#.*]] = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [0, 1]}>
- // CHECK: [[col_layout_novec:#.*]] = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
- // CHECK-LABEL: cst
--func @cst() -> tensor<1024xi32, #layout1> {
-+func.func @cst() -> tensor<1024xi32, #layout1> {
-   %cst = arith.constant dense<0> : tensor<1024xi32, #layout0>
-   %1 = triton_gpu.convert_layout %cst : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1>
-   // CHECK-NOT: triton_gpu.convert_layout
-@@ -18,7 +18,7 @@ func @cst() -> tensor<1024xi32, #layout1> {
- }
- 
- // CHECK-LABEL: range
--func @range() -> tensor<1024xi32, #layout1> {
-+func.func @range() -> tensor<1024xi32, #layout1> {
-   %0 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #layout0>
-   %1 = triton_gpu.convert_layout %0 : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1>
-   // CHECK-NOT: triton_gpu.convert_layout
-@@ -27,7 +27,7 @@ func @range() -> tensor<1024xi32, #layout1> {
- }
- 
- // CHECK-LABEL: splat
--func @splat(%arg0: i32) -> tensor<1024xi32, #layout1> {
-+func.func @splat(%arg0: i32) -> tensor<1024xi32, #layout1> {
-   %0 = tt.splat %arg0 : (i32) -> tensor<1024xi32, #layout0>
-   %1 = triton_gpu.convert_layout %0 : (tensor<1024xi32, #layout0>) -> tensor<1024xi32, #layout1>
-   // CHECK-NOT: triton_gpu.convert_layout
-@@ -36,7 +36,7 @@ func @splat(%arg0: i32) -> tensor<1024xi32, #layout1> {
- }
- 
- // CHECK-LABEL: remat
--func @remat(%arg0: i32) -> tensor<1024xi32, #layout1> {
-+func.func @remat(%arg0: i32) -> tensor<1024xi32, #layout1> {
-   %0 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #layout0>
-   %1 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #layout0>
-   %2 = arith.muli %0, %1 : tensor<1024xi32, #layout0>
-@@ -56,7 +56,7 @@ func @remat(%arg0: i32) -> tensor<1024xi32, #layout1> {
- }
- 
- // CHECK-LABEL: remat_load_store
--func @remat_load_store(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-+func.func @remat_load_store(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-   %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #layout0>
-   %1 = tt.splat %arg : (!tt.ptr<i32>) -> tensor<64x!tt.ptr<i32>, #layout0>
-   %2 = tt.addptr %1, %0 : tensor<64x!tt.ptr<i32>, #layout0>, tensor<64xi32, #layout0>
-@@ -70,7 +70,7 @@ func @remat_load_store(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
- 
- // Don't rematerialize vectorized loads
- // CHECK-LABEL: remat_expensive
--func @remat_expensive(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-+func.func @remat_expensive(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-   %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #layout1>
-   %1 = tt.splat %arg : (!tt.ptr<i32>) -> tensor<64x!tt.ptr<i32>, #layout1>
-   %2 = tt.addptr %1, %0 : tensor<64x!tt.ptr<i32>, #layout1>, tensor<64xi32, #layout1>
-@@ -85,7 +85,7 @@ func @remat_expensive(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
- 
- // Don't rematerialize loads when original and target layouts are different
- // CHECK-LABEL: remat_multi_layout
--func @remat_multi_layout(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-+func.func @remat_multi_layout(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-   %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #layout0>
-   %1 = tt.splat %arg : (!tt.ptr<i32>) -> tensor<64x!tt.ptr<i32>, #layout0>
-   %2 = tt.addptr %1, %0 : tensor<64x!tt.ptr<i32>, #layout0>, tensor<64xi32, #layout0>
-@@ -100,7 +100,7 @@ func @remat_multi_layout(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
- 
- // Always rematerialize single value loads
- // CHECK-LABEL: remat_single_value
--func @remat_single_value(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-+func.func @remat_single_value(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-   %0 = tt.splat %arg : (!tt.ptr<i32>) -> tensor<1x!tt.ptr<i32>, #layout1>
-   %1 = tt.load %0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1xi32, #layout1>
-   // CHECK-NOT: triton_gpu.convert_layout
-@@ -111,7 +111,7 @@ func @remat_single_value(%arg: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
- }
- 
- // CHECK-LABEL: if
--func @if(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-+func.func @if(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-   // CHECK-NOT: triton_gpu.convert_layout
-   %c32_i32 = arith.constant dense<32> : tensor<1024xi32, #layout1>
-   %0 = tt.get_program_id {axis = 0 : i32} : i32
-@@ -128,7 +128,7 @@ func @if(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
- }
- 
- // CHECK-LABEL: if_convert_else_not
--func @if_convert_else_not(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-+func.func @if_convert_else_not(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-   %c32_i32 = arith.constant dense<32> : tensor<1024xi32, #layout0>
-   %0 = tt.get_program_id {axis = 0 : i32} : i32
-   %1 = tt.splat %0 : (i32) -> tensor<1024xi32, #layout0>
-@@ -149,7 +149,7 @@ func @if_convert_else_not(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16
- }
- 
- // CHECK-LABEL: if_not_else_convert
--func @if_not_else_convert(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-+func.func @if_not_else_convert(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-   %c32_i32 = arith.constant dense<32> : tensor<1024xi32, #layout0>
-   %0 = tt.get_program_id {axis = 0 : i32} : i32
-   %1 = tt.splat %0 : (i32) -> tensor<1024xi32, #layout0>
-@@ -170,7 +170,7 @@ func @if_not_else_convert(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16
- }
- 
- // CHECK-LABEL: if_else_both_convert
--func @if_else_both_convert(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-+func.func @if_else_both_convert(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16 : i32}) {
-   %c32_i32 = arith.constant dense<32> : tensor<1024xi32, #layout0>
-   %0 = tt.get_program_id {axis = 0 : i32} : i32
-   %1 = tt.splat %0 : (i32) -> tensor<1024xi32, #layout0>
-@@ -200,7 +200,7 @@ func @if_else_both_convert(%arg0: i32, %arg1: !tt.ptr<i32> {tt.divisibility = 16
- #blocked4 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [0, 1]}>
- 
- // CHECK-LABEL: transpose
--func @transpose(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) {
-+func.func @transpose(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}) {
-   // CHECK-NOT: triton_gpu.convert_layout
-   // CHECK: [[loaded_val:%.*]] = tt.load {{.*}}, {{%cst.*}}, {{%cst.*}} {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x64xf32, [[row_layout]]>
-   // CHECK: [[cvt_val:%.*]] = triton_gpu.convert_layout [[loaded_val]] : (tensor<64x64xf32, [[row_layout]]>) -> tensor<64x64xf32, [[col_layout]]>
-@@ -241,7 +241,7 @@ func @transpose(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {tt
- }
- 
- // CHECK-LABEL: loop
--func @loop(%arg0: !tt.ptr<f32>, %arg1: i32, %arg2: !tt.ptr<f32>, %arg3: i32, %arg4: i32) {
-+func.func @loop(%arg0: !tt.ptr<f32>, %arg1: i32, %arg2: !tt.ptr<f32>, %arg3: i32, %arg4: i32) {
-     // CHECK-NOT: triton_gpu.convert_layout
-     // CHECK: [[loop_ret:%.*]]:2 = scf.for {{.*}} -> (tensor<64x64xf32, [[row_layout]]>, tensor<64x64x!tt.ptr<f32>, [[row_layout]]>)
-     // CHECK-NEXT: {{.*}} = tt.load {{.*}} : tensor<64x64xf32, [[row_layout]]>
-@@ -295,7 +295,7 @@ func @loop(%arg0: !tt.ptr<f32>, %arg1: i32, %arg2: !tt.ptr<f32>, %arg3: i32, %ar
- }
- 
- // CHECK-LABEL: vecadd
--func @vecadd(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32) {
-+func.func @vecadd(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32) {
-   // CHECK-NOT: triton_gpu.convert_layout
-   %c256_i32 = arith.constant 256 : i32
-   %0 = tt.get_program_id {axis = 0 : i32} : i32
-@@ -327,7 +327,7 @@ func @vecadd(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f3
- 
- // Select has args with different element types
- // CHECK-LABEL: select
--func @select(%arg0: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}) {
-+func.func @select(%arg0: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}) {
-   // CHECK-NOT: triton_gpu.convert_layout
-   %cst = arith.constant dense<30000> : tensor<1x1xi32, #blocked2>
-   %cst_0 = arith.constant dense<30000> : tensor<1x512xi32, #blocked2>
-@@ -378,7 +378,7 @@ func @select(%arg0: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f6
- 
- // Make sure the following IR doesn't hang the compiler.
- // CHECK-LABEL: long_func
--func public @long_func(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg10: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg11: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg12: !tt.ptr<i32> {tt.divisibility = 16 : i32}, %arg13: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg14: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg15: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg16: i32 {tt.divisibility = 16 : i32}) {
-+func.func public @long_func(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg10: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg11: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg12: !tt.ptr<i32> {tt.divisibility = 16 : i32}, %arg13: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg14: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg15: !tt.ptr<f64> {tt.divisibility = 16 : i32}, %arg16: i32 {tt.divisibility = 16 : i32}) {
-   %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked0>
-   %cst_0 = arith.constant dense<5.000000e-04> : tensor<1024xf32, #blocked0>
-   %cst_1 = arith.constant dense<0.999499976> : tensor<1024xf32, #blocked0>
-@@ -775,7 +775,7 @@ func public @long_func(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg1:
- // A mnist model from torch inductor.
- // Check if topological sort is working correct and there's no unnecessary convert
- // CHECK-LABEL: mnist
--func public @mnist(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}, %arg3: i32) {
-+func.func public @mnist(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}, %arg3: i32) {
-   // CHECK-NOT: triton_gpu.convert_layout
-   %cst = arith.constant dense<10> : tensor<16x1xi32, #blocked2>
-   %cst_0 = arith.constant dense<10> : tensor<1x16xi32, #blocked3>
-@@ -862,7 +862,7 @@ func public @mnist(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.
- #blocked5 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
- // cmpf and cmpi have different operands and result types
- // CHECK-LABEL: cmp
--func public @cmp(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) {
-+func.func public @cmp(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) {
-   %c64 = arith.constant 64 : index
-   %c2048 = arith.constant 2048 : index
-   %c0 = arith.constant 0 : index
-diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir
-index 6ee3b15fbc..663f2da7b0 100644
---- a/test/TritonGPU/loop-pipeline.mlir
-+++ b/test/TritonGPU/loop-pipeline.mlir
-@@ -10,7 +10,7 @@
- #A = #triton_gpu.dot_op<{opIdx = 0, parent = #C}>
- #B = #triton_gpu.dot_op<{opIdx = 1, parent = #C}>
- 
--// CHECK: func @matmul_loop
-+// CHECK: func.func @matmul_loop
- // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
- // CHECK-DAG: %[[CONSTANT_1:.*]] = arith.constant 1 : i32
- // CHECK-DAG: %[[CONSTANT_2:.*]] = arith.constant 2 : i32
-@@ -46,8 +46,8 @@
- // CHECK-DAG: %[[NEXT_PIPELINE_IDX:.*]] = arith.addi %[[PIPELINE_IDX]], %[[CONSTANT_1]]
- // CHECK-DAG: %[[NEXT_LOOP_IDX:.*]] = arith.addi %[[LOOP_IDX]], %[[CONSTANT_1]]
- // CHECK:   scf.yield {{.*}}, {{.*}}, {{.*}}, %[[NEXT_A_BUFFER]], %[[NEXT_B_BUFFER]], %[[NEXT_A]], %[[NEXT_B]], {{.*}}, {{.*}}, {{.*}}, %[[NEXT_PIPELINE_IDX]], %[[NEXT_LOOP_IDX]]
--func @matmul_loop(%lb : index, %ub : index, %step : index, 
--                  %A : !tt.ptr<f16> {tt.divisibility = 16 : i32}, 
-+func.func @matmul_loop(%lb : index, %ub : index, %step : index,
-+                  %A : !tt.ptr<f16> {tt.divisibility = 16 : i32},
-                   %B : !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
-   // A ptrs
-   %a_ptr_splat = tt.splat %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
-@@ -61,7 +61,7 @@ func @matmul_loop(%lb : index, %ub : index, %step : index,
-   %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : (tensor<128xi32, #BLs0>) -> tensor<1x128xi32, #BL>
-   %b_offs = tt.broadcast %b_tmp1 : (tensor<1x128xi32, #BL>) -> tensor<32x128xi32, #BL>
-   %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr<f16>, #BL>, tensor<32x128xi32, #BL>
--  
-+
- 
-   %a_mask = arith.constant dense<true> : tensor<128x32xi1, #AL>
-   %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL>
-@@ -88,7 +88,7 @@ func @matmul_loop(%lb : index, %ub : index, %step : index,
- }
- 
- 
--// CHECK: func @matmul_loop_nested
-+// CHECK: func.func @matmul_loop_nested
- // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
- // CHECK-DAG: %[[CONSTANT_1:.*]] = arith.constant 1 : i32
- // CHECK-DAG: %[[CONSTANT_2:.*]] = arith.constant 2 : i32
-@@ -118,8 +118,8 @@ func @matmul_loop(%lb : index, %ub : index, %step : index,
- // CHECK-DAG: %[[NEXT_PIPELINE_IDX:.*]] = arith.addi %[[PIPELINE_IDX]], %[[CONSTANT_1]]
- // CHECK-DAG: %[[NEXT_LOOP_IDX:.*]] = arith.addi %[[LOOP_IDX]], %[[CONSTANT_1]]
- // CHECK:     scf.yield {{.*}}, {{.*}}, {{.*}}, %[[NEXT_A_BUFFER]], %[[NEXT_B_BUFFER]], %[[NEXT_A]], %[[NEXT_B]], {{.*}}, {{.*}}, {{.*}}, %[[NEXT_PIPELINE_IDX]], %[[NEXT_LOOP_IDX]]
--func @matmul_loop_nested(%lb : index, %ub : index, %step : index, 
--                         %A : !tt.ptr<f16> {tt.divisibility = 16 : i32}, 
-+func.func @matmul_loop_nested(%lb : index, %ub : index, %step : index,
-+                         %A : !tt.ptr<f16> {tt.divisibility = 16 : i32},
-                          %B : !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
-   scf.for %iv0 = %lb to %ub step %step {
-     // A ptrs
-@@ -134,7 +134,7 @@ func @matmul_loop_nested(%lb : index, %ub : index, %step : index,
-     %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : (tensor<128xi32, #BLs0>) -> tensor<1x128xi32, #BL>
-     %b_offs = tt.broadcast %b_tmp1 : (tensor<1x128xi32, #BL>) -> tensor<32x128xi32, #BL>
-     %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr<f16>, #BL>, tensor<32x128xi32, #BL>
--  
-+
-     %a_mask = arith.constant dense<true> : tensor<128x32xi1, #AL>
-     %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL>
-     %b_mask = arith.constant dense<true> : tensor<32x128xi1, #BL>
-@@ -161,7 +161,7 @@ func @matmul_loop_nested(%lb : index, %ub : index, %step : index,
- }
- 
- 
--// CHECK: func @matmul_loop_single_pipeline
-+// CHECK: func.func @matmul_loop_single_pipeline
- // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
- // CHECK-DAG: %[[CONSTANT_1:.*]] = arith.constant 1 : i32
- // CHECK-DAG: %[[CONSTANT_2:.*]] = arith.constant 2 : i32
-@@ -183,8 +183,8 @@ func @matmul_loop_nested(%lb : index, %ub : index, %step : index,
- // CHECK-DAG: %[[NEXT_PIPELINE_IDX:.*]] = arith.addi %[[PIPELINE_IDX]], %[[CONSTANT_1]]
- // CHECK-DAG: %[[NEXT_LOOP_IDX:.*]] = arith.addi %[[LOOP_IDX]], %[[CONSTANT_1]]
- // CHECK:   scf.yield {{.*}}, {{.*}}, %[[NEXT_B_BUFFER]], %[[NEXT_B]], {{.*}}, {{.*}}, %[[NEXT_PIPELINE_IDX]], %[[NEXT_LOOP_IDX]]
--func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, 
--                                  %A : !tt.ptr<f16> {tt.divisibility = 16 : i32}, 
-+func.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
-+                                  %A : !tt.ptr<f16> {tt.divisibility = 16 : i32},
-                                   %B : !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
-   // A ptrs
-   %a_ptr_splat = tt.splat %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
-diff --git a/test/TritonGPU/matmul.mlir b/test/TritonGPU/matmul.mlir
-index 9bd5318e1e..01dc3f0ab1 100644
---- a/test/TritonGPU/matmul.mlir
-+++ b/test/TritonGPU/matmul.mlir
-@@ -4,7 +4,7 @@
- // CHECK: offset = 49152, size = 49152
- // CHECK: size = 98304
- module {
--func @matmul_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32_i32_i32_i32_i32_i32_i32__12c64_13c64_14c64_15c8(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32, %arg10: i32 {tt.divisibility = 16 : i32}, %arg11: i32) {
-+func.func @matmul_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32_i32_i32_i32_i32_i32_i32__12c64_13c64_14c64_15c8(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32, %arg10: i32 {tt.divisibility = 16 : i32}, %arg11: i32) {
-     %cst = arith.constant dense<true> : tensor<64x64xi1>
-     %c64 = arith.constant 64 : index
-     %c0 = arith.constant 0 : index
-@@ -22,7 +22,7 @@ func @matmul_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32_i32_i32_i32_i32_i32_i32__12c6
-     %7 = arith.muli %6, %c8_i32 : i32
-     %8 = arith.subi %2, %7 : i32
-     %9 = arith.cmpi slt, %8, %c8_i32 : i32
--    %10 = select %9, %8, %c8_i32 : i32
-+    %10 = arith.select %9, %8, %c8_i32 : i32
-     %11 = arith.remsi %0, %10 : i32
-     %12 = arith.addi %7, %11 : i32
-     %13 = arith.remsi %0, %5 : i32
-diff --git a/test/TritonGPU/prefetch.mlir b/test/TritonGPU/prefetch.mlir
-index 52b4dddec1..b427547890 100644
---- a/test/TritonGPU/prefetch.mlir
-+++ b/test/TritonGPU/prefetch.mlir
-@@ -11,7 +11,7 @@
- #B_OP = #triton_gpu.dot_op<{opIdx = 1, parent = #C}>
- 
- 
--// CHECK: func @matmul_loop
-+// CHECK: func.func @matmul_loop
- // CHECK-DAG: %[[A0_PREFETCH_SMEM:.*]] = tensor.extract_slice %[[A0:.*]][0, 0] [128, 16]
- // CHECK-DAG: %[[A0_PREFETCH:.*]] = triton_gpu.convert_layout %[[A0_PREFETCH_SMEM]]
- // CHECK-DAG: %[[B0_PREFETCH_SMEM:.*]] = tensor.extract_slice %[[B0:.*]][0, 0] [16, 128]
-@@ -28,7 +28,7 @@
- // CHECK-DAG:   %[[NEXT_B_PREFETCH_SMEM:.*]] = tensor.extract_slice {{.*}}[0, 0] [16, 128]
- // CHECK-DAG:   %[[NEXT_B_PREFETCH:.*]] = triton_gpu.convert_layout %[[NEXT_B_PREFETCH_SMEM]]
- // CHECK:     scf.yield {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[NEXT_A_PREFETCH]], %[[NEXT_B_PREFETCH]]
--func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-+func.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
-   %a_ptr_init = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
-   %b_ptr_init = tt.broadcast %B : (!tt.ptr<f16>) -> tensor<32x128x!tt.ptr<f16>, #BL>
- 
-diff --git a/test/TritonGPU/update-mma-for-volta.mlir b/test/TritonGPU/update-mma-for-volta.mlir
-index d587fffcca..7571ec6185 100644
---- a/test/TritonGPU/update-mma-for-volta.mlir
-+++ b/test/TritonGPU/update-mma-for-volta.mlir
-@@ -15,7 +15,7 @@
- // CHECK: [[new_mma:#mma.*]] = #triton_gpu.mma<{versionMajor = 1, versionMinor = 3, warpsPerCTA = [4, 2]}>
- module attributes {"triton_gpu.num-warps" = 16 : i32} {
-   // CHECK-LABEL: dot_mmav1
--  func @dot_mmav1(%A: tensor<64x64xf16, #blocked0>, %B: tensor<64x64xf16, #blocked0>) -> tensor<64x64xf32, #blocked0> {
-+  func.func @dot_mmav1(%A: tensor<64x64xf16, #blocked0>, %B: tensor<64x64xf16, #blocked0>) -> tensor<64x64xf32, #blocked0> {
-     %C = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked0>
-     %AA = triton_gpu.convert_layout %A : (tensor<64x64xf16, #blocked0>) -> tensor<64x64xf16, #dot_operand_a>
-     %BB = triton_gpu.convert_layout %B : (tensor<64x64xf16, #blocked0>) -> tensor<64x64xf16, #dot_operand_b>
-@@ -50,7 +50,7 @@ module attributes {"triton_gpu.num-warps" = 16 : i32} {
- 
- module attributes {"triton_gpu.num-warps" = 16 : i32} {
-   // CHECK-LABEL: dot_mmav1
--  func @dot_mmav1(%A: tensor<64x64xf16, #blocked0>, %B: tensor<64x64xf16, #blocked0>) -> tensor<64x64xf32, #blocked0> {
-+  func.func @dot_mmav1(%A: tensor<64x64xf16, #blocked0>, %B: tensor<64x64xf16, #blocked0>) -> tensor<64x64xf32, #blocked0> {
-     %C = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked0>
-     %AA = triton_gpu.convert_layout %A : (tensor<64x64xf16, #blocked0>) -> tensor<64x64xf16, #dot_operand_a>
-     %BB = triton_gpu.convert_layout %B : (tensor<64x64xf16, #blocked0>) -> tensor<64x64xf16, #dot_operand_b>
-diff --git a/test/lib/Analysis/TestAlias.cpp b/test/lib/Analysis/TestAlias.cpp
-index 88a4118fe9..3fd0cfd0d3 100644
---- a/test/lib/Analysis/TestAlias.cpp
-+++ b/test/lib/Analysis/TestAlias.cpp
-@@ -9,10 +9,10 @@ using namespace mlir;
- namespace {
- 
- struct TestAliasPass
--    : public PassWrapper<TestAliasPass, OperationPass<FuncOp>> {
-+    : public PassWrapper<TestAliasPass, OperationPass<func::FuncOp>> {
-+
-+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAliasPass);
- 
--  // LLVM15+
--  // MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAliasPass);
-   static void print(StringRef name, SmallVector<std::string, 4> &vals,
-                     raw_ostream &os) {
-     if (vals.empty())
-@@ -39,23 +39,24 @@ struct TestAliasPass
-     auto opName = SymbolTable::getSymbolName(operation).getValue().str();
-     os << opName << "\n";
- 
--    SharedMemoryAliasAnalysis analysis(&getContext());
--    analysis.run(operation);
-+    std::unique_ptr<DataFlowSolver> solver = createDataFlowSolver();
-+    SharedMemoryAliasAnalysis *analysis =
-+        solver->load<SharedMemoryAliasAnalysis>();
-+    if (failed(solver->initializeAndRun(operation)))
-+      return signalPassFailure();
- 
-     AsmState state(operation->getParentOfType<ModuleOp>());
-     // Get operation ids of value's aliases
-     auto getAllocOpNames = [&](Value value) {
--      LatticeElement<AliasInfo> *latticeElement =
--          analysis.lookupLatticeElement(value);
-+      dataflow::Lattice<AliasInfo> *latticeElement =
-+          analysis->getLatticeElement(value);
-       SmallVector<std::string, 4> opNames;
--      if (latticeElement) {
-+      if (latticeElement && !latticeElement->isUninitialized()) {
-         auto &info = latticeElement->getValue();
--        if (!info.getAllocs().empty()) {
--          for (auto &alias : info.getAllocs()) {
--            auto opName =
--                getValueOperandName(alias.getDefiningOp()->getResult(0), state);
--            opNames.push_back(std::move(opName));
--          }
-+        for (auto &alias : info.getAllocs()) {
-+          auto opName =
-+              getValueOperandName(alias.getDefiningOp()->getResult(0), state);
-+          opNames.push_back(std::move(opName));
-         }
-       }
-       // Ensure deterministic output
-diff --git a/test/lib/Analysis/TestAllocation.cpp b/test/lib/Analysis/TestAllocation.cpp
-index 84108c4d36..35e42242bd 100644
---- a/test/lib/Analysis/TestAllocation.cpp
-+++ b/test/lib/Analysis/TestAllocation.cpp
-@@ -6,10 +6,9 @@ using namespace mlir;
- namespace {
- 
- struct TestAllocationPass
--    : public PassWrapper<TestAllocationPass, OperationPass<FuncOp>> {
-+    : public PassWrapper<TestAllocationPass, OperationPass<func::FuncOp>> {
- 
--  // LLVM15+
--  // MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAllocationPass);
-+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAllocationPass);
- 
-   StringRef getArgument() const final { return "test-print-allocation"; }
-   StringRef getDescription() const final {
-diff --git a/test/lib/Analysis/TestAxisInfo.cpp b/test/lib/Analysis/TestAxisInfo.cpp
-index a5205bb0a0..22347c32f0 100644
---- a/test/lib/Analysis/TestAxisInfo.cpp
-+++ b/test/lib/Analysis/TestAxisInfo.cpp
-@@ -1,25 +1,15 @@
- #include "mlir/Pass/Pass.h"
- #include "triton/Analysis/AxisInfo.h"
-+#include "triton/Analysis/Utility.h"
- 
- using namespace mlir;
- 
- namespace {
- 
- struct TestAxisInfoPass
--    : public PassWrapper<TestAxisInfoPass, OperationPass<FuncOp>> {
-+    : public PassWrapper<TestAxisInfoPass, OperationPass<func::FuncOp>> {
- 
--  // LLVM15+
--  // MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAlignmentPass);
--
--  void print(const std::string &name, raw_ostream &os, ArrayRef<int64_t> vals) {
--    os << name << ": [";
--    for (size_t d = 0; d < vals.size(); d++) {
--      if (d != 0)
--        os << ", ";
--      os << vals[d];
--    }
--    os << "]";
--  }
-+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAxisInfoPass);
- 
-   StringRef getArgument() const final { return "test-print-alignment"; }
-   StringRef getDescription() const final {
-@@ -30,38 +20,19 @@ struct TestAxisInfoPass
-     Operation *operation = getOperation();
-     auto &os = llvm::errs();
-     auto opName = SymbolTable::getSymbolName(operation).getValue().str();
--    os << opName << "\n";
--    AxisInfoAnalysis analysis(&getContext());
--    analysis.run(operation);
-+    os << "@" << opName << "\n";
-+
-+    std::unique_ptr<DataFlowSolver> solver = createDataFlowSolver();
-+    AxisInfoAnalysis *analysis = solver->load<AxisInfoAnalysis>();
-+    if (failed(solver->initializeAndRun(operation)))
-+      return signalPassFailure();
-     operation->walk([&](Operation *op) {
-       if (op->getNumResults() < 1)
-         return;
-       for (Value result : op->getResults()) {
--        // std::ostringstream oss;
--        // result.print(oss);
--        // os << " => ";
--        LatticeElement<AxisInfo> *latticeElement =
--            analysis.lookupLatticeElement(result);
--        if (!latticeElement) {
--          os << "None\n";
--          return;
--        }
--        AxisInfo &info = latticeElement->getValue();
--        print("Contiguity", os, info.getContiguity());
--        os << " ; ";
--        print("Divisibility", os, info.getDivisibility());
--        os << " ; ";
--        print("Constancy", os, info.getConstancy());
--        os << " ; ";
--        auto constantValue = info.getConstantValue();
--        os << "ConstantValue: [";
--        if (constantValue.has_value())
--          os << constantValue.value();
--        else
--          os << "None";
--        os << "] ( ";
-         result.print(os);
--        os << " ) ";
-+        os << " => ";
-+        analysis->getLatticeElement(result)->getValue().print(os);
-         os << "\n";
-       }
-     });
-diff --git a/test/lib/Analysis/TestMembar.cpp b/test/lib/Analysis/TestMembar.cpp
-index df4279fe24..ab9b9f3fb7 100644
---- a/test/lib/Analysis/TestMembar.cpp
-+++ b/test/lib/Analysis/TestMembar.cpp
-@@ -1,4 +1,4 @@
--#include "mlir/Dialect/GPU/GPUDialect.h"
-+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
- #include "mlir/IR/Dialect.h"
- #include "mlir/Pass/Pass.h"
- #include "triton/Analysis/Allocation.h"
-@@ -9,10 +9,9 @@ using namespace mlir;
- namespace {
- 
- struct TestMembarPass
--    : public PassWrapper<TestMembarPass, OperationPass<FuncOp>> {
-+    : public PassWrapper<TestMembarPass, OperationPass<func::FuncOp>> {
- 
--  // LLVM15+
--  // MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestMembarPass);
-+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestMembarPass);
- 
-   StringRef getArgument() const final { return "test-print-membar"; }
-   StringRef getDescription() const final {
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index 16751903fe50..e2d512318111 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -8351,7 +8351,7 @@ self: super: with self; {
 
   open-meteo = callPackage ../development/python-modules/open-meteo { };
 
-  openai-triton = callPackage ../development/python-modules/openai-triton { llvmPackages = pkgs.rocmPackages.llvm; };
+  openai-triton = callPackage ../development/python-modules/openai-triton { cudaPackages = pkgs.cudaPackages_12_0; };
 
   openai-triton-bin = callPackage ../development/python-modules/openai-triton/bin.nix { };
 

From 68c237239bf1b10f512a53308ef22c68b8d2feb3 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 7 Oct 2023 00:33:21 -0500
Subject: [PATCH 26/30] frugally_deep: init at 0.15.24-p0

---
 .../libraries/frugally-deep/default.nix       | 53 +++++++++++++++++++
 pkgs/top-level/all-packages.nix               |  2 +
 2 files changed, 55 insertions(+)
 create mode 100644 pkgs/development/libraries/frugally-deep/default.nix

diff --git a/pkgs/development/libraries/frugally-deep/default.nix b/pkgs/development/libraries/frugally-deep/default.nix
new file mode 100644
index 000000000000..f275ec5f02ba
--- /dev/null
+++ b/pkgs/development/libraries/frugally-deep/default.nix
@@ -0,0 +1,53 @@
+{ lib
+, stdenv
+, fetchFromGitHub
+, gitUpdater
+, cmake
+, functionalplus
+, eigen
+, nlohmann_json
+, doctest
+, python3Packages
+, buildTests ? false # Needs tensorflow
+}:
+
+stdenv.mkDerivation (finalAttrs: {
+  pname = "frugally-deep";
+  version = "0.15.24-p0";
+
+  src = fetchFromGitHub {
+    owner = "Dobiasd";
+    repo = "frugally-deep";
+    rev = "v${finalAttrs.version}";
+    hash = "sha256-yg2SMsYOOSOgsdwIH1bU3iPM45z6c7WeIrgOddt3um4=";
+  };
+
+  nativeBuildInputs = [
+    cmake
+  ] ++ lib.optionals buildTests [
+    python3Packages.python
+    python3Packages.numpy
+  ];
+
+  buildInputs = lib.optionals buildTests [
+    doctest
+    python3Packages.tensorflow
+  ];
+
+  propagatedBuildInputs = [
+    functionalplus
+    eigen
+    nlohmann_json
+  ];
+
+  cmakeFlags = lib.optionals buildTests [ "-DFDEEP_BUILD_UNITTEST=ON" ];
+  passthru.updateScript = gitUpdater;
+
+  meta = with lib; {
+    description = "Header-only library for using Keras (TensorFlow) models in C++";
+    homepage = "https://github.com/Dobiasd/frugally-deep";
+    license = with licenses; [ mit ];
+    maintainers = with maintainers; [ Madouura ];
+    platforms = platforms.linux;
+  };
+})
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index a62783c951af..6f54e51f0f95 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -685,6 +685,8 @@ with pkgs;
 
   frugal = callPackage ../development/tools/frugal { };
 
+  frugally-deep = callPackage ../development/libraries/frugally-deep { };
+
   functiontrace-server = callPackage ../development/tools/functiontrace-server { };
 
   gendef = callPackage ../development/tools/gendef { };

From 1abbe92d900f117b8a537d7be59e5383303c11f2 Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 7 Oct 2023 05:39:37 -0500
Subject: [PATCH 27/30] rocmPackages.rocblas: split up output for hydra caching

---
 .../rocm-modules/5/rocblas/default.nix        | 259 +++++++++++-------
 1 file changed, 159 insertions(+), 100 deletions(-)

diff --git a/pkgs/development/rocm-modules/5/rocblas/default.nix b/pkgs/development/rocm-modules/5/rocblas/default.nix
index f1cd81df663f..76dc38850d57 100644
--- a/pkgs/development/rocm-modules/5/rocblas/default.nix
+++ b/pkgs/development/rocm-modules/5/rocblas/default.nix
@@ -2,6 +2,7 @@
 , stdenv
 , fetchFromGitHub
 , rocmUpdateScript
+, runCommand
 , cmake
 , rocm-cmake
 , clr
@@ -24,11 +25,147 @@
 , tensileLibFormat ? "msgpack"
 , gpuTargets ? [ "all" ]
 }:
+let
+  rocblas = stdenv.mkDerivation (finalAttrs: {
+    pname = "rocblas";
+    version = "5.7.0";
 
-# rocBLAS is 3.7GB... I'll have to figure out hydra in another PR
-stdenv.mkDerivation (finalAttrs: {
-  pname = "rocblas";
-  version = "5.7.0";
+    outputs = [
+      "out"
+    ] ++ lib.optionals buildTests [
+      "test"
+    ] ++ lib.optionals buildBenchmarks [
+      "benchmark"
+    ];
+
+    src = fetchFromGitHub {
+      owner = "ROCmSoftwarePlatform";
+      repo = "rocBLAS";
+      rev = "rocm-${finalAttrs.version}";
+      hash = "sha256-3wKnwvAra8u9xqlC05wUD+gSoBILTVJFU2cIV6xv3Lk=";
+    };
+
+    nativeBuildInputs = [
+      cmake
+      rocm-cmake
+      clr
+    ];
+
+    buildInputs = [
+      python3
+    ] ++ lib.optionals buildTensile [
+      msgpack
+      libxml2
+      python3Packages.msgpack
+      python3Packages.joblib
+    ] ++ lib.optionals buildTests [
+      gtest
+    ] ++ lib.optionals (buildTests || buildBenchmarks) [
+      gfortran
+      openmp
+      amd-blis
+    ] ++ lib.optionals (buildTensile || buildTests || buildBenchmarks) [
+      python3Packages.pyyaml
+    ];
+
+    cmakeFlags = [
+      "-DCMAKE_C_COMPILER=hipcc"
+      "-DCMAKE_CXX_COMPILER=hipcc"
+      "-Dpython=python3"
+      "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
+      "-DBUILD_WITH_TENSILE=${if buildTensile then "ON" else "OFF"}"
+      # Manually define CMAKE_INSTALL_<DIR>
+      # See: https://github.com/NixOS/nixpkgs/pull/197838
+      "-DCMAKE_INSTALL_BINDIR=bin"
+      "-DCMAKE_INSTALL_LIBDIR=lib"
+      "-DCMAKE_INSTALL_INCLUDEDIR=include"
+    ] ++ lib.optionals buildTensile [
+      "-DVIRTUALENV_HOME_DIR=/build/source/tensile"
+      "-DTensile_TEST_LOCAL_PATH=/build/source/tensile"
+      "-DTensile_ROOT=/build/source/tensile/lib/python${python3.pythonVersion}/site-packages/Tensile"
+      "-DTensile_LOGIC=${tensileLogic}"
+      "-DTensile_CODE_OBJECT_VERSION=${tensileCOVersion}"
+      "-DTensile_SEPARATE_ARCHITECTURES=${if tensileSepArch then "ON" else "OFF"}"
+      "-DTensile_LAZY_LIBRARY_LOADING=${if tensileLazyLib then "ON" else "OFF"}"
+      "-DTensile_LIBRARY_FORMAT=${tensileLibFormat}"
+    ] ++ lib.optionals buildTests [
+      "-DBUILD_CLIENTS_TESTS=ON"
+    ] ++ lib.optionals buildBenchmarks [
+      "-DBUILD_CLIENTS_BENCHMARKS=ON"
+    ] ++ lib.optionals (buildTests || buildBenchmarks) [
+      "-DCMAKE_CXX_FLAGS=-I${amd-blis}/include/blis"
+    ];
+
+    # Tensile REALLY wants to write to the nix directory if we include it normally
+    postPatch = lib.optionalString buildTensile ''
+      cp -a ${tensile} tensile
+      chmod +w -R tensile
+
+      # Rewrap Tensile
+      substituteInPlace tensile/bin/{.t*,.T*,*} \
+        --replace "${tensile}" "/build/source/tensile"
+
+      substituteInPlace CMakeLists.txt \
+        --replace "include(virtualenv)" "" \
+        --replace "virtualenv_install(\''${Tensile_TEST_LOCAL_PATH})" ""
+    '';
+
+    postInstall = lib.optionalString buildTests ''
+      mkdir -p $test/bin
+      cp -a $out/bin/* $test/bin
+      rm $test/bin/*-bench || true
+    '' + lib.optionalString buildBenchmarks ''
+      mkdir -p $benchmark/bin
+      cp -a $out/bin/* $benchmark/bin
+      rm $benchmark/bin/*-test || true
+    '' + lib.optionalString (buildTests || buildBenchmarks ) ''
+      rm -rf $out/bin
+    '';
+
+    passthru.updateScript = rocmUpdateScript {
+      name = finalAttrs.pname;
+      owner = finalAttrs.src.owner;
+      repo = finalAttrs.src.repo;
+    };
+
+    requiredSystemFeatures = [ "big-parallel" ];
+
+    meta = with lib; {
+      description = "BLAS implementation for ROCm platform";
+      homepage = "https://github.com/ROCmSoftwarePlatform/rocBLAS";
+      license = with licenses; [ mit ];
+      maintainers = teams.rocm.members;
+      platforms = platforms.linux;
+      broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
+    };
+  });
+
+  gfx80 = runCommand "rocblas-gfx80" { preferLocalBuild = true; } ''
+    mkdir -p $out/lib/rocblas/library
+    cp -a ${rocblas}/lib/rocblas/library/*gfx80* $out/lib/rocblas/library
+  '';
+
+  gfx90 = runCommand "rocblas-gfx90" { preferLocalBuild = true; } ''
+    mkdir -p $out/lib/rocblas/library
+    cp -a ${rocblas}/lib/rocblas/library/*gfx90* $out/lib/rocblas/library
+  '';
+
+  gfx94 = runCommand "rocblas-gfx94" { preferLocalBuild = true; } ''
+    mkdir -p $out/lib/rocblas/library
+    cp -a ${rocblas}/lib/rocblas/library/*gfx94* $out/lib/rocblas/library
+  '';
+
+  gfx10 = runCommand "rocblas-gfx10" { preferLocalBuild = true; } ''
+    mkdir -p $out/lib/rocblas/library
+    cp -a ${rocblas}/lib/rocblas/library/*gfx10* $out/lib/rocblas/library
+  '';
+
+  gfx11 = runCommand "rocblas-gfx11" { preferLocalBuild = true; } ''
+    mkdir -p $out/lib/rocblas/library
+    cp -a ${rocblas}/lib/rocblas/library/*gfx11* $out/lib/rocblas/library
+  '';
+in stdenv.mkDerivation (finalAttrs: {
+  inherit (rocblas) pname version src passthru meta;
 
   outputs = [
     "out"
@@ -38,104 +175,26 @@ stdenv.mkDerivation (finalAttrs: {
     "benchmark"
   ];
 
-  src = fetchFromGitHub {
-    owner = "ROCmSoftwarePlatform";
-    repo = "rocBLAS";
-    rev = "rocm-${finalAttrs.version}";
-    hash = "sha256-3wKnwvAra8u9xqlC05wUD+gSoBILTVJFU2cIV6xv3Lk=";
-  };
+  dontUnpack = true;
+  dontPatch = true;
+  dontConfigure = true;
+  dontBuild = true;
 
-  nativeBuildInputs = [
-    cmake
-    rocm-cmake
-    clr
-  ];
+  installPhase = ''
+    runHook preInstall
 
-  buildInputs = [
-    python3
-  ] ++ lib.optionals buildTensile [
-    msgpack
-    libxml2
-    python3Packages.msgpack
-    python3Packages.joblib
-  ] ++ lib.optionals buildTests [
-    gtest
-  ] ++ lib.optionals (buildTests || buildBenchmarks) [
-    gfortran
-    openmp
-    amd-blis
-  ] ++ lib.optionals (buildTensile || buildTests || buildBenchmarks) [
-    python3Packages.pyyaml
-  ];
-
-  cmakeFlags = [
-    "-DCMAKE_C_COMPILER=hipcc"
-    "-DCMAKE_CXX_COMPILER=hipcc"
-    "-Dpython=python3"
-    "-DAMDGPU_TARGETS=${lib.concatStringsSep ";" gpuTargets}"
-    "-DBUILD_WITH_TENSILE=${if buildTensile then "ON" else "OFF"}"
-    # Manually define CMAKE_INSTALL_<DIR>
-    # See: https://github.com/NixOS/nixpkgs/pull/197838
-    "-DCMAKE_INSTALL_BINDIR=bin"
-    "-DCMAKE_INSTALL_LIBDIR=lib"
-    "-DCMAKE_INSTALL_INCLUDEDIR=include"
-  ] ++ lib.optionals buildTensile [
-    "-DVIRTUALENV_HOME_DIR=/build/source/tensile"
-    "-DTensile_TEST_LOCAL_PATH=/build/source/tensile"
-    "-DTensile_ROOT=/build/source/tensile/lib/python${python3.pythonVersion}/site-packages/Tensile"
-    "-DTensile_LOGIC=${tensileLogic}"
-    "-DTensile_CODE_OBJECT_VERSION=${tensileCOVersion}"
-    "-DTensile_SEPARATE_ARCHITECTURES=${if tensileSepArch then "ON" else "OFF"}"
-    "-DTensile_LAZY_LIBRARY_LOADING=${if tensileLazyLib then "ON" else "OFF"}"
-    "-DTensile_LIBRARY_FORMAT=${tensileLibFormat}"
-  ] ++ lib.optionals buildTests [
-    "-DBUILD_CLIENTS_TESTS=ON"
-  ] ++ lib.optionals buildBenchmarks [
-    "-DBUILD_CLIENTS_BENCHMARKS=ON"
-  ] ++ lib.optionals (buildTests || buildBenchmarks) [
-    "-DCMAKE_CXX_FLAGS=-I${amd-blis}/include/blis"
-  ];
-
-  # Tensile REALLY wants to write to the nix directory if we include it normally
-  postPatch = lib.optionalString buildTensile ''
-    cp -a ${tensile} tensile
-    chmod +w -R tensile
-
-    # Rewrap Tensile
-    substituteInPlace tensile/bin/{.t*,.T*,*} \
-      --replace "${tensile}" "/build/source/tensile"
-
-    substituteInPlace CMakeLists.txt \
-      --replace "include(virtualenv)" "" \
-      --replace "virtualenv_install(\''${Tensile_TEST_LOCAL_PATH})" ""
-  '';
-
-  postInstall = lib.optionalString buildTests ''
-    mkdir -p $test/bin
-    cp -a $out/bin/* $test/bin
-    rm $test/bin/*-bench || true
+    mkdir -p $out
+    cp -a --no-preserve=mode ${rocblas}/* $out
+    ln -sf ${gfx80}/lib/rocblas/library/* $out/lib/rocblas/library
+    ln -sf ${gfx90}/lib/rocblas/library/* $out/lib/rocblas/library
+    ln -sf ${gfx94}/lib/rocblas/library/* $out/lib/rocblas/library
+    ln -sf ${gfx10}/lib/rocblas/library/* $out/lib/rocblas/library
+    ln -sf ${gfx11}/lib/rocblas/library/* $out/lib/rocblas/library
+  '' + lib.optionalString buildTests ''
+    cp -a ${rocblas.test} $test
   '' + lib.optionalString buildBenchmarks ''
-    mkdir -p $benchmark/bin
-    cp -a $out/bin/* $benchmark/bin
-    rm $benchmark/bin/*-test || true
-  '' + lib.optionalString (buildTests || buildBenchmarks ) ''
-    rm -rf $out/bin
+    cp -a ${rocblas.benchmark} $benchmark
+  '' + ''
+    runHook postInstall
   '';
-
-  passthru.updateScript = rocmUpdateScript {
-    name = finalAttrs.pname;
-    owner = finalAttrs.src.owner;
-    repo = finalAttrs.src.repo;
-  };
-
-  requiredSystemFeatures = [ "big-parallel" ];
-
-  meta = with lib; {
-    description = "BLAS implementation for ROCm platform";
-    homepage = "https://github.com/ROCmSoftwarePlatform/rocBLAS";
-    license = with licenses; [ mit ];
-    maintainers = teams.rocm.members;
-    platforms = platforms.linux;
-    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
-  };
 })

From 8f7b7b0b4ce3343239d8c7c920b6101d8aee3f6b Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sat, 7 Oct 2023 07:20:04 -0500
Subject: [PATCH 28/30] rocmPackages.half: init at 5.7.0

---
 pkgs/development/rocm-modules/5/default.nix   |  5 +++
 .../rocm-modules/5/half/default.nix           | 39 +++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 pkgs/development/rocm-modules/5/half/default.nix

diff --git a/pkgs/development/rocm-modules/5/default.nix b/pkgs/development/rocm-modules/5/default.nix
index ac8a32ef5176..1e1aa35fa328 100644
--- a/pkgs/development/rocm-modules/5/default.nix
+++ b/pkgs/development/rocm-modules/5/default.nix
@@ -242,6 +242,11 @@ in rec {
     stdenv = llvm.rocmClangStdenv;
   };
 
+  half = callPackage ./half {
+    inherit rocmUpdateScript rocm-cmake;
+    stdenv = llvm.rocmClangStdenv;
+  };
+
   miopen = callPackage ./miopen {
     inherit rocmUpdateScript rocm-cmake rocblas clang-ocl miopengemm composable_kernel rocm-comgr clr rocm-docs-core half;
     inherit (llvm) clang-tools-extra;
diff --git a/pkgs/development/rocm-modules/5/half/default.nix b/pkgs/development/rocm-modules/5/half/default.nix
new file mode 100644
index 000000000000..08c645848fa2
--- /dev/null
+++ b/pkgs/development/rocm-modules/5/half/default.nix
@@ -0,0 +1,39 @@
+{ lib
+, stdenv
+, fetchFromGitHub
+, rocmUpdateScript
+, cmake
+, rocm-cmake
+}:
+
+stdenv.mkDerivation (finalAttrs: {
+  pname = "half";
+  version = "5.7.0";
+
+  src = fetchFromGitHub {
+    owner = "ROCmSoftwarePlatform";
+    repo = "half";
+    rev = "rocm-${finalAttrs.version}";
+    hash = "sha256-82It+/wm8+umBdQYn7lz/fS69h+f0mzwPdGxoJNYUq0=";
+  };
+
+  nativeBuildInputs = [
+    cmake
+    rocm-cmake
+  ];
+
+  passthru.updateScript = rocmUpdateScript {
+    name = finalAttrs.pname;
+    owner = finalAttrs.src.owner;
+    repo = finalAttrs.src.repo;
+  };
+
+  meta = with lib; {
+    description = "C++ library for half precision floating point arithmetics";
+    homepage = "https://github.com/ROCmSoftwarePlatform/half";
+    license = with licenses; [ mit ];
+    maintainers = teams.rocm.members;
+    platforms = platforms.unix;
+    broken = versions.minor finalAttrs.version != versions.minor stdenv.cc.version;
+  };
+})

From 58f70713eba42a8fe73c6c373e7e9b7525a2a69f Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Sun, 8 Oct 2023 18:35:31 -0500
Subject: [PATCH 29/30] top-level/aliases: alias all old ROCm packages

---
 pkgs/top-level/aliases.nix | 52 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/pkgs/top-level/aliases.nix b/pkgs/top-level/aliases.nix
index 4053a661cb11..7f1225a54476 100644
--- a/pkgs/top-level/aliases.nix
+++ b/pkgs/top-level/aliases.nix
@@ -130,7 +130,9 @@ mapAliases ({
   chocolateDoom = chocolate-doom; # Added 2023-05-01
   chrome-gnome-shell = gnome-browser-connector; # Added 2022-07-27
   citra = citra-nightly; # added 2022-05-17
+  clang-ocl = throw "'clang-ocl' has been replaced with 'rocmPackages.clang-ocl'"; # Added 2023-10-08
   inherit (libsForQt5.mauiPackages) clip; # added 2022-05-17
+  composable_kernel = throw "'composable_kernel' has been replaced with 'rocmPackages.composable_kernel'"; # Added 2023-10-08
   cpp-ipfs-api = cpp-ipfs-http-client; # Project has been renamed. Added 2022-05-15
   crispyDoom = crispy-doom; # Added 2023-05-01
   clasp = clingo; # added 2022-12-22
@@ -331,6 +333,18 @@ mapAliases ({
   haxe_3_2 = throw "'haxe_3_2' has been removed because it is old and no longer used by any packages in nixpkgs"; # Added 2023-03-15
   haxe_3_4 = throw "'haxe_3_4' has been removed because it is old and no longer used by any packages in nixpkgs"; # Added 2023-03-15
   hepmc = throw "'hepmc' has been renamed to/replaced by 'hepmc2'"; # Converted to throw 2023-09-10
+  hip = throw "'hip' has been removed in favor of 'rocmPackages.clr'"; # Added 2023-10-08
+  hipcc = throw "'hipcc' has been replaced with 'rocmPackages.hipcc'"; # Added 2023-10-08
+  hipify = throw "'hipify' has been replaced with 'rocmPackages.hipify'"; # Added 2023-10-08
+  hipcub = throw "'hipcub' has been replaced with 'rocmPackages.hipcub'"; # Added 2023-10-08
+  hipsparse = throw "'hipsparse' has been replaced with 'rocmPackages.hipsparse'"; # Added 2023-10-08
+  hipfort = throw "'hipfort' has been replaced with 'rocmPackages.hipfort'"; # Added 2023-10-08
+  hipfft = throw "'hipfft' has been replaced with 'rocmPackages.hipfft'"; # Added 2023-10-08
+  hipsolver = throw "'hipsolver' has been replaced with 'rocmPackages.hipsolver'"; # Added 2023-10-08
+  hipblas = throw "'hipblas' has been replaced with 'rocmPackages.hipblas'"; # Added 2023-10-08
+  hip-amd = throw "'hip-amd' has been removed in favor of 'rocmPackages.clr'"; # Added 2023-10-08
+  hip-common = throw "'hip-common' has been replaced with 'rocmPackages.hip-common'"; # Added 2023-10-08
+  hip-nvidia = throw "'hip-nvidia' has been removed in favor of 'rocmPackages.clr'"; # Added 2023-10-08
   ht-rust = xh; # Added 2021-02-13
   hydra-unstable = hydra_unstable; # added 2022-05-10
 
@@ -403,6 +417,7 @@ mapAliases ({
   latinmodern-math = lmmath;
   ldgallery = throw "'ldgallery' has been removed from nixpkgs. Use the Flake provided by ldgallery instead"; # Added 2023-07-26
   lfs = dysk; # Added 2023-07-03
+  llvmPackages_rocm = throw "'llvmPackages_rocm' has been replaced with 'rocmPackages.llvm'"; # Added 2023-10-08
   libayatana-indicator-gtk3 = libayatana-indicator; # Added 2022-10-18
   libayatana-appindicator-gtk3 = libayatana-appindicator; # Added 2022-10-18
   libbencodetools = bencodetools; # Added 2022-07-30
@@ -526,6 +541,11 @@ mapAliases ({
   meme = meme-image-generator; # Added 2021-04-21
   mess = throw "'mess' has been renamed to/replaced by 'mame'"; # Converted to throw 2023-09-10
   microsoft_gsl = microsoft-gsl; # Added 2023-05-26
+  migraphx = throw "'migraphx' has been replaced with 'rocmPackages.migraphx'"; # Added 2023-10-08
+  miopen = throw "'miopen' has been replaced with 'rocmPackages.miopen'"; # Added 2023-10-08
+  miopengemm = throw "'miopengemm' has been replaced with 'rocmPackages.miopengemm'"; # Added 2023-10-08
+  miopen-hip = throw "'miopen-hip' has been replaced with 'rocmPackages.miopen-hip'"; # Added 2023-10-08
+  miopen-opencl = throw "'miopen-opencl' has been replaced with 'rocmPackages.miopen-opencl'"; # Added 2023-10-08
   mime-types = mailcap; # Added 2022-01-21
   minizip2 = pkgs.minizip-ng; # Added 2022-12-28
   monero = monero-cli; # Added 2021-11-28
@@ -703,10 +723,41 @@ mapAliases ({
   radare2-cutter = cutter; # Added 2021-03-30
   rambox-pro = rambox; # Added 2022-12-12
   rarian = throw "rarian has been removed as unused"; # Added 2023-07-05
+  rccl = throw "'rccl' has been replaced with 'rocmPackages.rccl'"; # Added 2023-10-08
+  rdc = throw "'rdc' has been replaced with 'rocmPackages.rdc'"; # Added 2023-10-08
   retroshare06 = retroshare;
   rigsofrods = rigsofrods-bin; # Added 2023-03-22
   ring-daemon = jami-daemon; # Added 2021-10-26
   rockbox_utility = rockbox-utility; # Added 2022-03-17
+  rocalution = throw "'rocalution' has been replaced with 'rocmPackages.rocalution'"; # Added 2023-10-08
+  rocblas = throw "'rocblas' has been replaced with 'rocmPackages.rocblas'"; # Added 2023-10-08
+  rocfft = throw "'rocfft' has been replaced with 'rocmPackages.rocfft'"; # Added 2023-10-08
+  rocprim = throw "'rocprim' has been replaced with 'rocmPackages.rocprim'"; # Added 2023-10-08
+  rocrand = throw "'rocrand' has been replaced with 'rocmPackages.rocrand'"; # Added 2023-10-08
+  rocsparse = throw "'rocsparse' has been replaced with 'rocmPackages.rocsparse'"; # Added 2023-10-08
+  rocthrust = throw "'rocthrust' has been replaced with 'rocmPackages.rocthrust'"; # Added 2023-10-08
+  roctracer = throw "'roctracer' has been replaced with 'rocmPackages.roctracer'"; # Added 2023-10-08
+  rocwmma = throw "'rocwmma' has been replaced with 'rocmPackages.rocwmma'"; # Added 2023-10-08
+  rocclr = throw "'rocclr' has been removed in favor of 'rocmPackages.clr'"; # Added 2023-10-08
+  rocdbgapi = throw "'rocdbgapi' has been replaced with 'rocmPackages.rocdbgapi'"; # Added 2023-10-08
+  rocgdb = throw "'rocgdb' has been replaced with 'rocmPackages.rocgdb'"; # Added 2023-10-08
+  rocprofiler = throw "'rocprofiler' has been replaced with 'rocmPackages.rocprofiler'"; # Added 2023-10-08
+  rocsolver = throw "'rocsolver' has been replaced with 'rocmPackages.rocsolver'"; # Added 2023-10-08
+  rocmClangStdenv = throw "'rocmClangStdenv' has been moved to 'rocmPackages' and is no longer public"; # Added 2023-10-08
+  rocmUpdateScript = throw "'rocmUpdateScript' has been moved to 'rocmPackages' and is no longer public"; # Added 2023-10-08
+  rocminfo = throw "'rocminfo' has been replaced with 'rocmPackages.rocminfo'"; # Added 2023-10-08
+  rocmlir = throw "'rocmlir' has been replaced with 'rocmPackages.rocmlir'"; # Added 2023-10-08
+  rocmlir-rock = throw "'rocmlir-rock' has been replaced with 'rocmPackages.rocmlir-rock'"; # Added 2023-10-08
+  rocm-cmake = throw "'rocm-cmake' has been replaced with 'rocmPackages.rocm-cmake'"; # Added 2023-10-08
+  rocm-comgr = throw "'rocm-comgr' has been replaced with 'rocmPackages.rocm-comgr'"; # Added 2023-10-08
+  rocm-core = throw "'rocm-core' has been replaced with 'rocmPackages.rocm-core'"; # Added 2023-10-08
+  rocm-device-libs = throw "'rccl' has been replaced with 'rocmPackages.rocm-device-libs'"; # Added 2023-10-08
+  rocm-opencl-icd = lib.warn "'rocm-opencl-icd' has been replaced with 'rocmPackages.clr.icd'" rocmPackages.clr.icd; # Added 2023-10-08
+  rocm-opencl-runtime = lib.warn "'rocm-opencl-runtime' has been replaced with 'rocmPackages.clr'" rocmPackages.clr; # Added 2023-10-08
+  rocm-runtime = throw "'rocm-runtime' has been replaced with 'rocmPackages.rocm-runtime'"; # Added 2023-10-08
+  rocm-smi = throw "'rocm-smi' has been replaced with 'rocmPackages.rocm-smi'"; # Added 2023-10-08
+  rocm-thunk = throw "'rocm-thunk' has been replaced with 'rocmPackages.rocm-thunk'"; # Added 2023-10-08
+  rocr-debug-agent = throw "'rocr-debug-agent' has been replaced with 'rocmPackages.rocr-debug-agent'"; # Added 2023-10-08
   rome = throw "rome is no longer maintained, consider using biome instead"; # Added 2023-09-12
   rpiboot-unstable = rpiboot; # Added 2021-07-30
   rr-unstable = rr; # Added 2022-09-17
@@ -795,6 +846,7 @@ mapAliases ({
   taro = taproot-assets; # Added 2023-07-04
   tdesktop = telegram-desktop; # Added 2023-04-07
   telegram-cli = throw "telegram-cli was removed because it was broken and abandoned upstream"; # Added 2023-07-28
+  tensile = throw "'tensile' has been replaced with 'rocmPackages.tensile'"; # Added 2023-10-08
   testVersion = testers.testVersion; # Added 2022-04-20
   invalidateFetcherByDrvHash = testers.invalidateFetcherByDrvHash; # Added 2022-05-05
   timescale-prometheus = promscale; # Added 2020-09-29

From aeccee810aabf22bc1bdda305e5611e9e61c406e Mon Sep 17 00:00:00 2001
From: Madoura <madouura@gmail.com>
Date: Tue, 10 Oct 2023 01:43:09 -0500
Subject: [PATCH 30/30] python3Packages.torch: Fix 'setuptools' not being found
 with ROCm build

---
 pkgs/development/python-modules/torch/default.nix | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix
index c9c400b57bd5..217c43de2327 100644
--- a/pkgs/development/python-modules/torch/default.nix
+++ b/pkgs/development/python-modules/torch/default.nix
@@ -101,6 +101,11 @@ let
       rocminfo rocm-thunk rocm-comgr rocm-device-libs
       rocm-runtime clr.icd hipify
     ];
+
+    # Fix `setuptools` not being found
+    postBuild = ''
+      rm -rf $out/nix-support
+    '';
   };
 
   brokenConditions = attrsets.filterAttrs (_: cond: cond) {